import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import plotly.express as px
import plotly.graph_objects as go
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
import missingno as msno
import os
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
nltk.download("punkt")
nltk.download('stopwords')
nltk.download('wordnet')
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from collections import Counter
from wordcloud import WordCloud
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, f1_score, recall_score
from sklearn.metrics import precision_recall_curve, average_precision_score
from yellowbrick.classifier import PrecisionRecallCurve
import nltk
nltk.download("punkt")
nltk.download('stopwords')
nltk.download('wordnet')
# Importing plotly and cufflinks in offline mode
import cufflinks as cf
import plotly.express as px
import plotly.offline
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
# Ignore Warnings
import warnings
warnings.filterwarnings("ignore")
warnings.warn("this will not show")
# Figure&Display options
%matplotlib inline
fig, ax = plt.subplots()
# fig.set_size_inches(10, 6)
plt.rcParams["figure.figsize"] = (12, 8) # the size of A4 paper use (11.7, 8.27)
pd.set_option('max_colwidth', 200)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 200)
pd.set_option('display.float_format', lambda x: '%.2f' % x)
# !pip install termcolor
import colorama
from colorama import Fore, Style # maakes strings colored
from termcolor import colored
import ipywidgets
from ipywidgets import interact
# !pip install -U pandas-profiling --user
# !pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
import pandas_profiling
from pandas_profiling.report.presentation.flavours.html.templates import create_html_assets
[nltk_data] Downloading package punkt to [nltk_data] C:\Users\aksha\AppData\Roaming\nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package stopwords to [nltk_data] C:\Users\aksha\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package wordnet to [nltk_data] C:\Users\aksha\AppData\Roaming\nltk_data... [nltk_data] Package wordnet is already up-to-date! [nltk_data] Downloading package punkt to [nltk_data] C:\Users\aksha\AppData\Roaming\nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package stopwords to [nltk_data] C:\Users\aksha\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package wordnet to [nltk_data] C:\Users\aksha\AppData\Roaming\nltk_data... [nltk_data] Package wordnet is already up-to-date!
--------------------------------------------------------------------------- PydanticImportError Traceback (most recent call last) Cell In[5], line 77 74 # !pip install -U pandas-profiling --user 75 # !pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip 76 import pandas_profiling ---> 77 from pandas_profiling.report.presentation.flavours.html.templates import create_html_assets File ~\AppData\Roaming\Python\Python311\site-packages\pandas_profiling\report\__init__.py:2 1 """All functionality concerned with presentation to the user.""" ----> 2 from pandas_profiling.report.structure.report import get_report_structure 4 __all__ = ["get_report_structure"] File ~\AppData\Roaming\Python\Python311\site-packages\pandas_profiling\report\structure\report.py:7 4 import pandas as pd 5 from tqdm.auto import tqdm ----> 7 from pandas_profiling.config import Settings 8 from pandas_profiling.model.alerts import AlertType 9 from pandas_profiling.model.handler import get_render_map File ~\AppData\Roaming\Python\Python311\site-packages\pandas_profiling\config.py:5 2 from enum import Enum 3 from typing import Any, Dict, List, Optional ----> 5 from pydantic import BaseModel, BaseSettings, Field 8 def _merge_dictionaries(dict1: dict, dict2: dict) -> dict: 9 """ 10 Recursive merge dictionaries. 11 (...) 14 :return: Merged dictionary 15 """ File ~\AppData\Roaming\Python\Python311\site-packages\pydantic\__init__.py:386, in __getattr__(attr_name) 384 dynamic_attr = _dynamic_imports.get(attr_name) 385 if dynamic_attr is None: --> 386 return _getattr_migration(attr_name) 388 package, module_name = dynamic_attr 390 from importlib import import_module File ~\AppData\Roaming\Python\Python311\site-packages\pydantic\_migration.py:296, in getattr_migration.<locals>.wrapper(name) 294 return import_string(REDIRECT_TO_V1[import_path]) 295 if import_path == 'pydantic:BaseSettings': --> 296 raise PydanticImportError( 297 '`BaseSettings` has been moved to the `pydantic-settings` package. ' 298 f'See https://docs.pydantic.dev/{version_short()}/migration/#basesettings-has-moved-to-pydantic-settings ' 299 'for more details.' 300 ) 301 if import_path in REMOVED_IN_V2: 302 raise PydanticImportError(f'`{import_path}` has been removed in V2.') PydanticImportError: `BaseSettings` has been moved to the `pydantic-settings` package. See https://docs.pydantic.dev/2.7/migration/#basesettings-has-moved-to-pydantic-settings for more details. For further information visit https://errors.pydantic.dev/2.7/u/import-error
def missing_values(df):
missing_number = df.isnull().sum().sort_values(ascending=False)
missing_percent = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)
missing_values = pd.concat([missing_number, missing_percent], axis=1, keys=['Missing_Number', 'Missing_Percent'])
return missing_values[missing_values['Missing_Number']>0]
def first_looking(df):
print(colored("Shape:", attrs=['bold']), df.shape,'\n',
colored('-'*79, 'red', attrs=['bold']),
colored("\nInfo:\n", attrs=['bold']), sep='')
print(df.info(), '\n',
colored('-'*79, 'red', attrs=['bold']), sep='')
print(colored("Number of Uniques:\n", attrs=['bold']), df.nunique(),'\n',
colored('-'*79, 'red', attrs=['bold']), sep='')
print(colored("Missing Values:\n", attrs=['bold']), missing_values(df),'\n',
colored('-'*79, 'red', attrs=['bold']), sep='')
print(colored("All Columns:", attrs=['bold']), list(df.columns),'\n',
colored('-'*79, 'red', attrs=['bold']), sep='')
df.columns= df.columns.str.lower().str.replace('&', '_').str.replace(' ', '_')
print(colored("Columns after rename:", attrs=['bold']), list(df.columns),'\n',
colored('-'*79, 'red', attrs=['bold']), sep='')
def multicolinearity_control(df):
feature =[]
collinear=[]
for col in df.corr().columns:
for i in df.corr().index:
if (abs(df.corr()[col][i])> .9 and abs(df.corr()[col][i]) < 1):
feature.append(col)
collinear.append(i)
print(colored(f"Multicolinearity alert in between:{col} - {i}",
"red", attrs=['bold']), df.shape,'\n',
colored('-'*79, 'red', attrs=['bold']), sep='')
def duplicate_values(df):
print(colored("Duplicate check...", attrs=['bold']), sep='')
duplicate_values = df.duplicated(subset=None, keep='first').sum()
if duplicate_values > 0:
df.drop_duplicates(keep='first', inplace=True)
print(duplicate_values, colored("Duplicates were dropped!"),'\n',
colored('-'*79, 'red', attrs=['bold']), sep='')
else:
print(colored("There are no duplicates"),'\n',
colored('-'*79, 'red', attrs=['bold']), sep='')
def drop_columns(df, drop_columns):
if drop_columns !=[]:
df.drop(drop_columns, axis=1, inplace=True)
print(drop_columns, 'were dropped')
else:
print(colored('We will now check the missing values and if necessary will drop realted columns!', attrs=['bold']),'\n',
colored('-'*79, 'red', attrs=['bold']), sep='')
def drop_null(df, limit):
print('Shape:', df.shape)
for i in df.isnull().sum().index:
if (df.isnull().sum()[i]/df.shape[0]*100)>limit:
print(df.isnull().sum()[i], 'percent of', i ,'null and were dropped')
df.drop(i, axis=1, inplace=True)
print('new shape:', df.shape)
print('New shape after missing value control:', df.shape)
###############################################################################
# To view summary information about the column
def first_look(col):
print("column name : ", col)
print("--------------------------------")
print("per_of_nulls : ", "%", round(df[col].isnull().sum()/df.shape[0]*100, 2))
print("num_of_nulls : ", df[col].isnull().sum())
print("num_of_uniques : ", df[col].nunique())
print(df[col].value_counts(dropna = False))
os.chdir('C:\\Users\\aksha\\OneDrive\\Desktop\\Placement\\Self Project\\NLP')
df0=pd.read_csv('Womens Clothing E-Commerce Reviews.csv')
df = df0.copy()
df.head()
| Unnamed: 0 | Clothing ID | Age | Title | Review Text | Rating | Recommended IND | Positive Feedback Count | Division Name | Department Name | Class Name | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 767 | 33 | NaN | Absolutely wonderful - silky and sexy and comfortable | 4 | 1 | 0 | Initmates | Intimate | Intimates |
| 1 | 1 | 1080 | 34 | NaN | Love this dress! it's sooo pretty. i happened to find it in a store, and i'm glad i did bc i never would have ordered it online bc it's petite. i bought a petite and am 5'8". i love the length... | 5 | 1 | 4 | General | Dresses | Dresses |
| 2 | 2 | 1077 | 60 | Some major design flaws | I had such high hopes for this dress and really wanted it to work for me. i initially ordered the petite small (my usual size) but i found this to be outrageously small. so small in fact that i co... | 3 | 0 | 0 | General | Dresses | Dresses |
| 3 | 3 | 1049 | 50 | My favorite buy! | I love, love, love this jumpsuit. it's fun, flirty, and fabulous! every time i wear it, i get nothing but great compliments! | 5 | 1 | 0 | General Petite | Bottoms | Pants |
| 4 | 4 | 847 | 47 | Flattering shirt | This shirt is very flattering to all due to the adjustable front tie. it is the perfect length to wear with leggings and it is sleeveless so it pairs well with any cardigan. love this shirt!!! | 5 | 1 | 6 | General | Tops | Blouses |
df.profile_report()
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
first_looking(df)
Shape:(23486, 11) ------------------------------------------------------------------------------- Info: <class 'pandas.core.frame.DataFrame'> RangeIndex: 23486 entries, 0 to 23485 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 23486 non-null int64 1 Clothing ID 23486 non-null int64 2 Age 23486 non-null int64 3 Title 19676 non-null object 4 Review Text 22641 non-null object 5 Rating 23486 non-null int64 6 Recommended IND 23486 non-null int64 7 Positive Feedback Count 23486 non-null int64 8 Division Name 23472 non-null object 9 Department Name 23472 non-null object 10 Class Name 23472 non-null object dtypes: int64(6), object(5) memory usage: 2.0+ MB None ------------------------------------------------------------------------------- Number of Uniques: Unnamed: 0 23486 Clothing ID 1206 Age 77 Title 13993 Review Text 22634 Rating 5 Recommended IND 2 Positive Feedback Count 82 Division Name 3 Department Name 6 Class Name 20 dtype: int64 ------------------------------------------------------------------------------- Missing Values: Missing_Number Missing_Percent Title 3810 0.16 Review Text 845 0.04 Division Name 14 0.00 Department Name 14 0.00 Class Name 14 0.00 ------------------------------------------------------------------------------- All Columns:['Unnamed: 0', 'Clothing ID', 'Age', 'Title', 'Review Text', 'Rating', 'Recommended IND', 'Positive Feedback Count', 'Division Name', 'Department Name', 'Class Name'] ------------------------------------------------------------------------------- Columns after rename:['unnamed:_0', 'clothing_id', 'age', 'title', 'review_text', 'rating', 'recommended_ind', 'positive_feedback_count', 'division_name', 'department_name', 'class_name'] -------------------------------------------------------------------------------
df.head(1)
| unnamed:_0 | clothing_id | age | title | review_text | rating | recommended_ind | positive_feedback_count | division_name | department_name | class_name | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 767 | 33 | NaN | Absolutely wonderful - silky and sexy and comfortable | 4 | 1 | 0 | Initmates | Intimate | Intimates |
df.sample(3)
| unnamed:_0 | clothing_id | age | title | review_text | rating | recommended_ind | positive_feedback_count | division_name | department_name | class_name | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 8339 | 8339 | 1098 | 42 | NaN | NaN | 5 | 1 | 0 | General | Dresses | Dresses |
| 1615 | 1615 | 825 | 45 | Perfect | Received this shirt yesterday and loved it. doesn't look that great online in pics, but it is a unique and beautiful top. the material is a substantial woven cotton so it lays nicely and keeps its... | 5 | 1 | 2 | General Petite | Tops | Blouses |
| 7289 | 7289 | 862 | 28 | Date night! | I'm always weary of v necks but this shirt is great. the fit is true to size and while it is a v neck, it's not such a deep plunge that you need a cami underneath. great material too and perfect f... | 5 | 1 | 0 | General | Tops | Knits |
df.shape
(23486, 11)
df.drop("unnamed:_0", axis=1, inplace=True)
df.head(1)
| clothing_id | age | title | review_text | rating | recommended_ind | positive_feedback_count | division_name | department_name | class_name | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 767 | 33 | NaN | Absolutely wonderful - silky and sexy and comfortable | 4 | 1 | 0 | Initmates | Intimate | Intimates |
df.describe().T.style.background_gradient(subset=['mean','std','50%','count'], cmap='RdPu').format('{:.2f}')
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| clothing_id | 23486.00 | 918.12 | 203.30 | 0.00 | 861.00 | 936.00 | 1078.00 | 1205.00 |
| age | 23486.00 | 43.20 | 12.28 | 18.00 | 34.00 | 41.00 | 52.00 | 99.00 |
| rating | 23486.00 | 4.20 | 1.11 | 1.00 | 4.00 | 5.00 | 5.00 | 5.00 |
| recommended_ind | 23486.00 | 0.82 | 0.38 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 |
| positive_feedback_count | 23486.00 | 2.54 | 5.70 | 0.00 | 0.00 | 1.00 | 3.00 | 122.00 |
df.describe(include=object).T
| count | unique | top | freq | |
|---|---|---|---|---|
| title | 19676 | 13993 | Love it! | 136 |
| review_text | 22641 | 22634 | Perfect fit and i've gotten so many compliments. i buy all my suits from here now! | 3 |
| division_name | 23472 | 3 | General | 13850 |
| department_name | 23472 | 6 | Tops | 10468 |
| class_name | 23472 | 20 | Dresses | 6319 |
# to find how many unique values numerical features have
for col in df.select_dtypes(include=[np.number]).columns:
print(colored(f"{col}", 'green', attrs=['bold']), f"feature has", colored(f"{df[col].nunique()}", 'green', attrs=['bold']), f"unique values.")
clothing_id feature has 1206 unique values. age feature has 77 unique values. rating feature has 5 unique values. recommended_ind feature has 2 unique values. positive_feedback_count feature has 82 unique values.
# to find how many unique values object features have
for col in df.select_dtypes(include="object").columns:
print(colored(f"{col}", 'green', attrs=['bold']), f"feature has", colored(f"{df[col].nunique()}", 'green', attrs=['bold']), f"unique values.")
title feature has 13993 unique values. review_text feature has 22634 unique values. division_name feature has 3 unique values. department_name feature has 6 unique values. class_name feature has 20 unique values.
df.columns
Index(['clothing_id', 'age', 'title', 'review_text', 'rating',
'recommended_ind', 'positive_feedback_count', 'division_name',
'department_name', 'class_name'],
dtype='object')
df.head(2)
| clothing_id | age | title | review_text | rating | recommended_ind | positive_feedback_count | division_name | department_name | class_name | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 767 | 33 | NaN | Absolutely wonderful - silky and sexy and comfortable | 4 | 1 | 0 | Initmates | Intimate | Intimates |
| 1 | 1080 | 34 | NaN | Love this dress! it's sooo pretty. i happened to find it in a store, and i'm glad i did bc i never would have ordered it online bc it's petite. i bought a petite and am 5'8". i love the length... | 5 | 1 | 4 | General | Dresses | Dresses |
df["recommended_ind"].value_counts()
recommended_ind 1 19314 0 4172 Name: count, dtype: int64
df["recommended_ind"].value_counts()
recommended_ind 1 19314 0 4172 Name: count, dtype: int64
first_look("recommended_ind")
column name : recommended_ind -------------------------------- per_of_nulls : % 0.0 num_of_nulls : 0 num_of_uniques : 2 recommended_ind 1 19314 0 4172 Name: count, dtype: int64
df["recommended_ind"].describe().T
count 23486.00 mean 0.82 std 0.38 min 0.00 25% 1.00 50% 1.00 75% 1.00 max 1.00 Name: recommended_ind, dtype: float64
sns.countplot(x = df.recommended_ind, data = df)
plt.title('Customer Recommendation Distribution', fontsize=30)
plt.xlabel("Recommendation Label", fontsize=24)
plt.ylabel("The Number of Recommendations", fontsize=24)
for index,value in enumerate(df.recommended_ind.value_counts().sort_values()):
plt.text(index, value, f"{value}", ha="center", va="bottom", fontsize = 13);
plt.figure(figsize=(8, 8))
explode = [0, 0.1]
plt.pie(df['recommended_ind'].value_counts(), explode=explode, autopct='%1.1f%%', shadow=True, startangle=140)
plt.legend(labels=['1', '0'])
plt.title('Customer Recommendation Distribution', fontsize=20)
plt.axis('off');
df["rating"].value_counts()
rating 5 13131 4 5077 3 2871 2 1565 1 842 Name: count, dtype: int64
first_look("rating")
column name : rating -------------------------------- per_of_nulls : % 0.0 num_of_nulls : 0 num_of_uniques : 5 rating 5 13131 4 5077 3 2871 2 1565 1 842 Name: count, dtype: int64
df["rating"].describe().T
count 23486.00 mean 4.20 std 1.11 min 1.00 25% 4.00 50% 5.00 75% 5.00 max 5.00 Name: rating, dtype: float64
sns.countplot(x = df.rating, data = df)
plt.title('Customer Rating Distribution', fontsize=30)
plt.xlabel("Rating Label", fontsize=24)
plt.ylabel("The Number of Rating", fontsize=24)
for index,value in enumerate(df.rating.value_counts().sort_values()):
plt.text(index, value, f"{value}", ha="center", va="bottom", fontsize = 13);
plt.figure(figsize=(8, 8))
explode = [0.1, 0, 0, 0, 0.1]
plt.pie(df['rating'].value_counts(), explode=explode, autopct='%1.1f%%', shadow=True, startangle=140)
plt.legend(labels=['1', '2', '3', '4','5'])
plt.title('Customer Rating Distribution', fontsize=20)
plt.axis('off');
df["age"].value_counts()
age 39 1269 35 909 36 842 34 804 38 780 37 766 41 741 33 725 46 713 42 651 32 631 48 626 40 617 44 617 43 579 31 569 47 564 53 560 45 529 29 513 49 490 56 471 52 442 28 428 26 423 30 407 50 398 54 395 51 393 57 363 27 344 60 341 62 338 25 331 55 328 59 321 58 277 66 276 64 247 23 247 24 245 63 239 61 227 65 226 67 157 22 146 68 141 69 113 20 108 21 102 70 93 71 51 74 50 72 46 83 43 19 40 73 40 75 26 77 18 78 15 79 15 82 13 76 10 80 10 85 6 84 6 89 5 81 5 91 5 18 4 87 4 94 3 93 2 90 2 86 2 99 2 92 1 Name: count, dtype: int64
first_look("age")
column name : age -------------------------------- per_of_nulls : % 0.0 num_of_nulls : 0 num_of_uniques : 77 age 39 1269 35 909 36 842 34 804 38 780 37 766 41 741 33 725 46 713 42 651 32 631 48 626 40 617 44 617 43 579 31 569 47 564 53 560 45 529 29 513 49 490 56 471 52 442 28 428 26 423 30 407 50 398 54 395 51 393 57 363 27 344 60 341 62 338 25 331 55 328 59 321 58 277 66 276 64 247 23 247 24 245 63 239 61 227 65 226 67 157 22 146 68 141 69 113 20 108 21 102 70 93 71 51 74 50 72 46 83 43 19 40 73 40 75 26 77 18 78 15 79 15 82 13 76 10 80 10 85 6 84 6 89 5 81 5 91 5 18 4 87 4 94 3 93 2 90 2 86 2 99 2 92 1 Name: count, dtype: int64
df["age"].describe().T
count 23486.00 mean 43.20 std 12.28 min 18.00 25% 34.00 50% 41.00 75% 52.00 max 99.00 Name: age, dtype: float64
plt.figure(figsize = (20, 8))
plt.title('Customer Age Distribution', fontsize=30)
plt.xlabel("Age", fontsize=24)
plt.ylabel("The Number of Customer Age", fontsize=18)
sns.histplot(df, x='age', kde = True, bins = 50);
fig_dims = (30, 10)
fig, ax = plt.subplots(figsize=fig_dims)
sns.barplot(data = df, x = "age", y = "recommended_ind", ax=ax)
plt.title('Customer Age Distribution By Recommendations', fontsize=30)
plt.xlabel("Age", fontsize=24)
plt.ylabel("Customer Recommendation Ratio", fontsize=18)
plt.xticks(rotation = 45);
fig = px.histogram(df['age'], color=df['recommended_ind'],
labels={'value': 'Age', 'color': 'Recommended'},
color_discrete_map={0: "magenta", 1: "MediumPurple"},
marginal='box')
fig.update_traces(marker=dict(line=dict(color='#000000', width=2)))
fig.update_layout(title_text='Distribution of the Age and Recommendation',
title_x=0.5, title_font=dict(size=20))
fig.update_layout(barmode='overlay')
fig.show()
df["positive_feedback_count"].value_counts()
positive_feedback_count 0 11176 1 4043 2 2193 3 1433 4 922 5 673 6 525 7 374 8 319 9 261 10 225 11 178 12 146 14 121 13 102 15 94 17 81 16 74 18 62 19 54 20 40 23 31 21 30 22 29 25 25 28 24 26 23 24 21 27 20 30 18 31 17 29 15 32 10 42 9 37 9 38 8 34 7 35 6 36 6 33 6 51 5 81 4 45 4 39 4 65 4 43 4 40 3 47 3 57 3 41 3 44 3 53 2 49 2 46 2 58 2 55 2 64 1 89 1 68 1 56 1 48 1 98 1 93 1 95 1 50 1 99 1 87 1 69 1 66 1 54 1 108 1 122 1 52 1 78 1 82 1 61 1 77 1 94 1 117 1 71 1 84 1 59 1 Name: count, dtype: int64
first_look("positive_feedback_count")
column name : positive_feedback_count -------------------------------- per_of_nulls : % 0.0 num_of_nulls : 0 num_of_uniques : 82 positive_feedback_count 0 11176 1 4043 2 2193 3 1433 4 922 5 673 6 525 7 374 8 319 9 261 10 225 11 178 12 146 14 121 13 102 15 94 17 81 16 74 18 62 19 54 20 40 23 31 21 30 22 29 25 25 28 24 26 23 24 21 27 20 30 18 31 17 29 15 32 10 42 9 37 9 38 8 34 7 35 6 36 6 33 6 51 5 81 4 45 4 39 4 65 4 43 4 40 3 47 3 57 3 41 3 44 3 53 2 49 2 46 2 58 2 55 2 64 1 89 1 68 1 56 1 48 1 98 1 93 1 95 1 50 1 99 1 87 1 69 1 66 1 54 1 108 1 122 1 52 1 78 1 82 1 61 1 77 1 94 1 117 1 71 1 84 1 59 1 Name: count, dtype: int64
df["positive_feedback_count"].describe().T
count 23486.00 mean 2.54 std 5.70 min 0.00 25% 0.00 50% 1.00 75% 3.00 max 122.00 Name: positive_feedback_count, dtype: float64
plt.figure(figsize = (20, 8))
plt.title('Customer Positive Feedback Distribution', fontsize=20)
plt.xlabel("Customer Positive Feedback", fontsize=24)
plt.ylabel("The Number of Customer Positive Feedback", fontsize=18)
sns.histplot(df, x='positive_feedback_count', kde = True, bins = 50);
fig_dims = (30, 10)
fig, ax = plt.subplots(figsize=fig_dims)
sns.barplot(data = df, x = "positive_feedback_count", y = "age", ax=ax)
plt.xlabel("Customer Positive Feedback", fontsize=24)
plt.ylabel("Age", fontsize=24)
plt.title('Customer Positive Feedback Distribution By Age', fontsize=30)
plt.xticks(rotation = 45);
df.columns
Index(['clothing_id', 'age', 'title', 'review_text', 'rating',
'recommended_ind', 'positive_feedback_count', 'division_name',
'department_name', 'class_name'],
dtype='object')
df["division_name"].value_counts()
division_name General 13850 General Petite 8120 Initmates 1502 Name: count, dtype: int64
first_look("division_name")
column name : division_name -------------------------------- per_of_nulls : % 0.06 num_of_nulls : 14 num_of_uniques : 3 division_name General 13850 General Petite 8120 Initmates 1502 NaN 14 Name: count, dtype: int64
df["division_name"].describe().T
count 23472 unique 3 top General freq 13850 Name: division_name, dtype: object
g = sns.catplot( x='division_name',
kind="count",
data=df,
height=5,
aspect=2)
plt.title('Division Distribution', fontsize=24)
plt.xlabel("Division Name", fontsize=24)
plt.ylabel("The Number of Divisions", fontsize=20)
ax = g.facet_axis(0, 0)
for p in ax.patches:
ax.text(p.get_x() + 0.28,
p.get_height() * 1.025,
'{0:.0f}'.format(p.get_height()),
color='black', rotation='horizontal', size='large')
plt.show()
plt.figure(figsize=(8, 8))
explode = [0.1, 0.1, 0]
plt.pie(df['division_name'].value_counts(), explode=explode, autopct='%1.1f%%', shadow=True, startangle=140)
plt.legend(labels=['1', '2', '3'])
plt.title('Division Distribution', fontsize=20)
plt.axis('off');
g = sns.catplot(data = df, x ="division_name", hue = "recommended_ind", kind='count', height=5, aspect=2, legend_out=False)
plt.title('Division Distribution By Recommendation', fontsize=24)
plt.xlabel("Division Name By Recommendation", fontsize=20)
plt.ylabel("The Number of Divisions", fontsize=20)
plt.legend(title='Recommendation Indicator', loc='upper left', labels=['Not Recomnended', 'Recomnended'])
ax = g.facet_axis(0, 0)
for p in ax.patches:
ax.text(p.get_x() + 0.12,
p.get_height() * 1.025,
'{0:.0f}'.format(p.get_height()),
color='black', rotation='horizontal', size='large')
plt.show()
g = sns.catplot(data = df, x ="rating", hue = "division_name", kind='count', height=5, aspect=2, legend_out=False)
plt.title('Rating Distribution By Division', fontsize=24)
plt.xlabel("Ratings By Division", fontsize=20)
plt.ylabel("The Number of Ratings", fontsize=20)
plt.legend(title='Division Name', loc='upper left', labels=['Intimates', 'General', 'General Petite'])
ax = g.facet_axis(0, 0)
for p in ax.patches:
ax.text(p.get_x() + 0.04,
p.get_height() * 1.025,
'{0:.0f}'.format(p.get_height()),
color='black', rotation='horizontal', size='large')
plt.show()
df["department_name"].value_counts()
department_name Tops 10468 Dresses 6319 Bottoms 3799 Intimate 1735 Jackets 1032 Trend 119 Name: count, dtype: int64
first_look("department_name")
column name : department_name -------------------------------- per_of_nulls : % 0.06 num_of_nulls : 14 num_of_uniques : 6 department_name Tops 10468 Dresses 6319 Bottoms 3799 Intimate 1735 Jackets 1032 Trend 119 NaN 14 Name: count, dtype: int64
df["department_name"].describe().T
count 23472 unique 6 top Tops freq 10468 Name: department_name, dtype: object
g = sns.catplot(data = df, x ="department_name", kind='count', height=5, aspect=2)
plt.title('Department Distribution', fontsize=26)
plt.xlabel("Department Name", fontsize=20)
plt.ylabel("The Number of Departments", fontsize=20)
ax = g.facet_axis(0, 0)
for p in ax.patches:
ax.text(p.get_x() + 0.28,
p.get_height() * 1.025,
'{0:.0f}'.format(p.get_height()),
color='black', rotation='horizontal', size='large')
plt.show()
plt.figure(figsize=(8, 8))
explode = [0.1, 0, 0, 0, 0, 0]
plt.pie(df['department_name'].value_counts(), explode=explode, autopct='%1.1f%%', shadow=True, startangle=140)
plt.legend(labels=['Tops', 'Dresses', 'Bottoms', 'Intimate', 'Jackets', 'Trend'])
plt.title('Department Distribution', fontsize=20)
plt.axis('off');
g = sns.catplot(data = df, x ="department_name", hue = "recommended_ind", kind='count', height=7, aspect=2.5, legend_out=False)
plt.title('Department Distribution By Recommendation', fontsize=26)
plt.xlabel("Department Name", fontsize=20)
plt.ylabel("The Number of Recommendations", fontsize=20)
plt.legend(title='Recommendation Indicator', loc='upper left', labels=['Not Recomnended', 'Recomnended'], fontsize='x-large', title_fontsize='24')
ax = g.facet_axis(0, 0)
for p in ax.patches:
ax.text(p.get_x() + 0.12,
p.get_height() * 1.025,
'{0:.0f}'.format(p.get_height()),
color='black', rotation='horizontal', size='large')
plt.show()
g = sns.catplot(data = df, x ="rating", hue = "department_name", kind='count', height=10, aspect=2.5, legend_out=False)
plt.title('Department Distribution By Recommendation', fontsize=26)
plt.xlabel("Department Name", fontsize=20)
plt.ylabel("The Number of Recommendations", fontsize=20)
plt.legend(title='Department Name', loc='upper left', labels=['Intimates', 'Dresses', 'Bottoms', 'Tops', 'Jackets', 'Trend'], fontsize='x-large', title_fontsize='24')
plt.figure(figsize=(15, 8))
ax = g.facet_axis(0, 0)
for p in ax.patches:
ax.text(p.get_x() + 0.025,
p.get_height() * 1.025,
'{0:.0f}'.format(p.get_height()),
color='black', rotation='horizontal', size='large')
plt.show()
<Figure size 1500x800 with 0 Axes>
df["class_name"].value_counts()
class_name Dresses 6319 Knits 4843 Blouses 3097 Sweaters 1428 Pants 1388 Jeans 1147 Fine gauge 1100 Skirts 945 Jackets 704 Lounge 691 Swim 350 Outerwear 328 Shorts 317 Sleep 228 Legwear 165 Intimates 154 Layering 146 Trend 119 Casual bottoms 2 Chemises 1 Name: count, dtype: int64
first_look("class_name")
column name : class_name -------------------------------- per_of_nulls : % 0.06 num_of_nulls : 14 num_of_uniques : 20 class_name Dresses 6319 Knits 4843 Blouses 3097 Sweaters 1428 Pants 1388 Jeans 1147 Fine gauge 1100 Skirts 945 Jackets 704 Lounge 691 Swim 350 Outerwear 328 Shorts 317 Sleep 228 Legwear 165 Intimates 154 Layering 146 Trend 119 NaN 14 Casual bottoms 2 Chemises 1 Name: count, dtype: int64
df["class_name"].describe().T
count 23472 unique 20 top Dresses freq 6319 Name: class_name, dtype: object
plt.title('Product Class Distribution', fontsize=25)
df["class_name"].value_counts().plot(kind="pie", autopct='%1.1f%%', figsize=(16, 16));
g = sns.catplot(data = df, x ="department_name", hue = "rating", kind='count', height=10, aspect=2.5)
sns.set(rc = {'figure.figsize':(30, 12)})
plt.title('Department Distribution By Rating', fontsize=30)
plt.xlabel("Department Name", fontsize=24)
plt.ylabel("The Number of Ratings", fontsize=24)
plt.legend(title='Rating Label', loc='upper left', labels=['1', '2', '3', '4', '5'], fontsize='x-large', title_fontsize='24')
ax = g.facet_axis(0, 0)
for p in ax.patches:
ax.text(p.get_x() + 0.02,
p.get_height() * 1.025,
'{0:.0f}'.format(p.get_height()),
color='black', rotation='horizontal', size='large', fontsize = 18)
plt.show()
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) Cell In[63], line 11 9 ax = g.facet_axis(0, 0) 10 for p in ax.patches: ---> 11 ax.text(p.get_x() + 0.02, 12 p.get_height() * 1.025, 13 '{0:.0f}'.format(p.get_height()), 14 color='black', rotation='horizontal', size='large', fontsize = 18) 16 plt.show() File C:\ProgramData\anaconda3\Lib\site-packages\matplotlib\axes\_axes.py:689, in Axes.text(self, x, y, s, fontdict, **kwargs) 628 """ 629 Add text to the Axes. 630 (...) 679 >>> text(x, y, s, bbox=dict(facecolor='red', alpha=0.5)) 680 """ 681 effective_kwargs = { 682 'verticalalignment': 'baseline', 683 'horizontalalignment': 'left', (...) 687 **kwargs, 688 } --> 689 t = mtext.Text(x, y, text=s, **effective_kwargs) 690 t.set_clip_path(self.patch) 691 self._add_text(t) File C:\ProgramData\anaconda3\Lib\site-packages\matplotlib\_api\deprecation.py:454, in make_keyword_only.<locals>.wrapper(*args, **kwargs) 448 if len(args) > name_idx: 449 warn_deprecated( 450 since, message="Passing the %(name)s %(obj_type)s " 451 "positionally is deprecated since Matplotlib %(since)s; the " 452 "parameter will become keyword-only %(removal)s.", 453 name=name, obj_type=f"parameter of {func.__name__}()") --> 454 return func(*args, **kwargs) File C:\ProgramData\anaconda3\Lib\site-packages\matplotlib\text.py:183, in Text.__init__(self, x, y, text, color, verticalalignment, horizontalalignment, multialignment, fontproperties, rotation, linespacing, rotation_mode, usetex, wrap, transform_rotates_text, parse_math, **kwargs) 167 self._text = '' 168 self._reset_visual_defaults( 169 text=text, 170 color=color, (...) 181 rotation_mode=rotation_mode, 182 ) --> 183 self.update(kwargs) File C:\ProgramData\anaconda3\Lib\site-packages\matplotlib\text.py:223, in Text.update(self, kwargs) 221 def update(self, kwargs): 222 # docstring inherited --> 223 kwargs = cbook.normalize_kwargs(kwargs, Text) 224 sentinel = object() # bbox can be None, so use another sentinel. 225 # Update fontproperties first, as it has lowest priority. File C:\ProgramData\anaconda3\Lib\site-packages\matplotlib\cbook\__init__.py:1779, in normalize_kwargs(kw, alias_mapping) 1777 canonical = to_canonical.get(k, k) 1778 if canonical in canonical_to_seen: -> 1779 raise TypeError(f"Got both {canonical_to_seen[canonical]!r} and " 1780 f"{k!r}, which are aliases of one another") 1781 canonical_to_seen[canonical] = k 1782 ret[canonical] = v TypeError: Got both 'size' and 'fontsize', which are aliases of one another
df_cat = df[['division_name', 'department_name', 'class_name', "recommended_ind"]]
df_cat["recommended_ind"] = df_cat["recommended_ind"].apply(lambda x: "Recommended" if x>=1 else "Not Recommended")
df_cat.rename({'division_name': 'Division Name', 'department_name': 'Department Name', 'class_name': 'Class Name', 'recommended_ind': 'Recommendation Indicator'}, axis=1, inplace=True)
df_cat
| Division Name | Department Name | Class Name | Recommendation Indicator | |
|---|---|---|---|---|
| 0 | Initmates | Intimate | Intimates | Recommended |
| 1 | General | Dresses | Dresses | Recommended |
| 2 | General | Dresses | Dresses | Not Recommended |
| 3 | General Petite | Bottoms | Pants | Recommended |
| 4 | General | Tops | Blouses | Recommended |
| ... | ... | ... | ... | ... |
| 23481 | General Petite | Dresses | Dresses | Recommended |
| 23482 | General Petite | Tops | Knits | Recommended |
| 23483 | General Petite | Dresses | Dresses | Not Recommended |
| 23484 | General | Dresses | Dresses | Recommended |
| 23485 | General Petite | Dresses | Dresses | Recommended |
23486 rows × 4 columns
df_num = df[['age', 'rating', 'positive_feedback_count', 'recommended_ind']]
df_num["recommended_ind"] = df_num["recommended_ind"].apply(lambda x: "Recommended" if x>=1 else "Not Recommended")
df_num.rename({'age': 'Age', 'rating': 'Rating', 'positive_feedback_count': 'Positive Feedback', 'recommended_ind': 'Recommendation Indicator'}, axis=1, inplace=True)
df_num
| Age | Rating | Positive Feedback | Recommendation Indicator | |
|---|---|---|---|---|
| 0 | 33 | 4 | 0 | Recommended |
| 1 | 34 | 5 | 4 | Recommended |
| 2 | 60 | 3 | 0 | Not Recommended |
| 3 | 50 | 5 | 0 | Recommended |
| 4 | 47 | 5 | 6 | Recommended |
| ... | ... | ... | ... | ... |
| 23481 | 34 | 5 | 0 | Recommended |
| 23482 | 48 | 3 | 0 | Recommended |
| 23483 | 31 | 3 | 1 | Not Recommended |
| 23484 | 28 | 3 | 2 | Recommended |
| 23485 | 52 | 5 | 22 | Recommended |
23486 rows × 4 columns
for i, col in enumerate(df_cat.columns):
xtab = pd.crosstab(df_cat[col], df_cat["Recommendation Indicator"], normalize=True)
print(colored('-'*55, 'red', attrs=['bold']), sep='')
print(xtab*100)
------------------------------------------------------- Recommendation Indicator Not Recommended Recommended Division Name General 10.81 48.20 General Petite 6.02 28.57 Initmates 0.95 5.45 ------------------------------------------------------- Recommendation Indicator Not Recommended Recommended Department Name Bottoms 2.41 13.78 Dresses 5.16 21.76 Intimate 1.11 6.28 Jackets 0.72 3.68 Tops 8.24 36.35 Trend 0.13 0.37 ------------------------------------------------------- Recommendation Indicator Not Recommended Recommended Class Name Blouses 2.51 10.69 Casual bottoms 0.00 0.01 Chemises 0.00 0.00 Dresses 5.16 21.76 Fine gauge 0.76 3.92 Intimates 0.09 0.56 Jackets 0.46 2.53 Jeans 0.58 4.31 Knits 3.76 16.87 Layering 0.07 0.55 Legwear 0.10 0.60 Lounge 0.41 2.53 Outerwear 0.26 1.14 Pants 0.99 4.93 Shorts 0.22 1.13 Skirts 0.62 3.40 Sleep 0.14 0.83 Sweaters 1.21 4.87 Swim 0.29 1.20 Trend 0.13 0.37 ------------------------------------------------------- Recommendation Indicator Not Recommended Recommended Recommendation Indicator Not Recommended 17.76 0.00 Recommended 0.00 82.24
for i, col in enumerate(df_num.columns):
xtab = pd.crosstab(df_num[col], df_num["Recommendation Indicator"], normalize=True)
print(colored('-'*55, 'red', attrs=['bold']), sep='')
print(xtab*100)
------------------------------------------------------- Recommendation Indicator Not Recommended Recommended Age 18 0.00 0.02 19 0.02 0.15 20 0.06 0.40 21 0.04 0.40 22 0.06 0.56 23 0.16 0.89 24 0.17 0.87 25 0.27 1.14 26 0.33 1.47 27 0.25 1.21 28 0.44 1.38 29 0.48 1.71 30 0.36 1.37 31 0.52 1.90 32 0.45 2.24 33 0.69 2.40 34 0.61 2.81 35 0.79 3.08 36 0.63 2.96 37 0.68 2.58 38 0.70 2.62 39 0.69 4.72 40 0.52 2.11 41 0.51 2.64 42 0.56 2.21 43 0.51 1.96 44 0.47 2.15 45 0.39 1.86 46 0.69 2.35 47 0.44 1.96 48 0.49 2.18 49 0.33 1.76 50 0.29 1.41 51 0.31 1.36 52 0.29 1.60 53 0.45 1.94 54 0.24 1.44 55 0.22 1.18 56 0.33 1.68 57 0.27 1.28 58 0.15 1.03 59 0.19 1.18 60 0.22 1.23 61 0.13 0.83 62 0.19 1.25 63 0.17 0.85 64 0.18 0.87 65 0.10 0.86 66 0.13 1.04 67 0.14 0.53 68 0.09 0.52 69 0.07 0.41 70 0.06 0.33 71 0.04 0.18 72 0.03 0.17 73 0.03 0.14 74 0.03 0.18 75 0.04 0.07 76 0.01 0.03 77 0.02 0.06 78 0.01 0.05 79 0.00 0.06 80 0.00 0.04 81 0.01 0.01 82 0.01 0.05 83 0.00 0.18 84 0.00 0.02 85 0.00 0.02 86 0.00 0.01 87 0.00 0.02 89 0.00 0.02 90 0.00 0.00 91 0.01 0.01 92 0.00 0.00 93 0.00 0.01 94 0.00 0.01 99 0.00 0.01 ------------------------------------------------------- Recommendation Indicator Not Recommended Recommended Rating 1 3.52 0.07 2 6.26 0.40 3 7.16 5.06 4 0.72 20.90 5 0.11 55.80 ------------------------------------------------------- Recommendation Indicator Not Recommended Recommended Positive Feedback 0 7.25 40.33 1 2.93 14.29 2 1.69 7.64 3 1.16 4.94 4 0.87 3.05 5 0.72 2.14 6 0.50 1.73 7 0.36 1.23 8 0.25 1.11 9 0.25 0.86 10 0.24 0.72 11 0.15 0.60 12 0.13 0.49 13 0.14 0.29 14 0.16 0.36 15 0.13 0.27 16 0.07 0.25 17 0.12 0.23 18 0.07 0.20 19 0.03 0.20 20 0.06 0.11 21 0.04 0.09 22 0.03 0.10 23 0.03 0.10 24 0.02 0.07 25 0.03 0.07 26 0.04 0.06 27 0.03 0.06 28 0.02 0.09 29 0.01 0.06 30 0.02 0.06 31 0.01 0.06 32 0.01 0.03 33 0.00 0.03 34 0.01 0.02 35 0.01 0.01 36 0.01 0.02 37 0.02 0.02 38 0.02 0.02 39 0.01 0.01 40 0.00 0.01 41 0.00 0.01 42 0.03 0.01 43 0.01 0.00 44 0.01 0.00 45 0.00 0.02 46 0.00 0.01 47 0.00 0.01 48 0.00 0.00 49 0.01 0.00 50 0.00 0.00 51 0.01 0.01 52 0.00 0.00 53 0.00 0.01 54 0.00 0.00 55 0.01 0.00 56 0.00 0.00 57 0.00 0.01 58 0.00 0.01 59 0.00 0.00 61 0.00 0.00 64 0.00 0.00 65 0.00 0.01 66 0.00 0.00 68 0.00 0.00 69 0.00 0.00 71 0.00 0.00 77 0.00 0.00 78 0.00 0.00 81 0.00 0.02 82 0.00 0.00 84 0.00 0.00 87 0.00 0.00 89 0.00 0.00 93 0.00 0.00 94 0.00 0.00 95 0.00 0.00 98 0.00 0.00 99 0.00 0.00 108 0.00 0.00 117 0.00 0.00 122 0.00 0.00 ------------------------------------------------------- Recommendation Indicator Not Recommended Recommended Recommendation Indicator Not Recommended 17.76 0.00 Recommended 0.00 82.24
df.columns
Index(['clothing_id', 'age', 'title', 'review_text', 'rating',
'recommended_ind', 'positive_feedback_count', 'division_name',
'department_name', 'class_name'],
dtype='object')
df.drop(['clothing_id', 'age', 'title', 'rating',
'positive_feedback_count', 'division_name',
'department_name', 'class_name'], axis=1, inplace=True)
df.head(3)
| review_text | recommended_ind | |
|---|---|---|
| 0 | Absolutely wonderful - silky and sexy and comfortable | 1 |
| 1 | Love this dress! it's sooo pretty. i happened to find it in a store, and i'm glad i did bc i never would have ordered it online bc it's petite. i bought a petite and am 5'8". i love the length... | 1 |
| 2 | I had such high hopes for this dress and really wanted it to work for me. i initially ordered the petite small (my usual size) but i found this to be outrageously small. so small in fact that i co... | 0 |
df['review_text'].isnull().value_counts()
review_text False 22641 True 845 Name: count, dtype: int64
df['recommended_ind'].isnull().value_counts()
recommended_ind False 23486 Name: count, dtype: int64
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 23486 entries, 0 to 23485 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 review_text 22641 non-null object 1 recommended_ind 23486 non-null int64 dtypes: int64(1), object(1) memory usage: 367.1+ KB
df = df.dropna()
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 22641 entries, 0 to 23485 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 review_text 22641 non-null object 1 recommended_ind 22641 non-null int64 dtypes: int64(1), object(1) memory usage: 530.6+ KB
df['review_text'].isnull().value_counts()
review_text False 22641 Name: count, dtype: int64
df['recommended_ind'].isnull().value_counts()
recommended_ind False 22641 Name: count, dtype: int64
missing_values(df)
| Missing_Number | Missing_Percent |
|---|
blanks = [] # start with an empty list
for rv in df.itertuples(): # iterate over the DataFrame
if type(rv)==str and rv.isspace(): # avoid NaN values and test 'review' for whitespace
blanks.append(i)
blanks
[]
df["review_text"].str.isspace().sum()
0
df[df["review_text"].str.isspace() == True].index
Index([], dtype='int64')
def cleaning_fsa(data):
import re
#1. Remove Puncs
# \w typically matches [A-Za-z0-9_]
text = re.sub('[^\w\s]','', data)
#2. Tokenize
text_tokens = word_tokenize(text.lower())
#3. Remove numbers
tokens_without_punc = [w for w in text_tokens if w.isalpha()]
#4. Removing Stopwords
tokens_without_sw = [t for t in tokens_without_punc if t not in stop_words]
#5. lemma
text_cleaned = [WordNetLemmatizer().lemmatize(t) for t in tokens_without_sw]
#joining
return " ".join(text_cleaned)
stop_words = stopwords.words('english')
text = cleaning_fsa(str(df["review_text"]))
text
'absolutely wonderful silky sexy comfortable love dress sooo pretty happened find store im glad bc never would ordered online bc petite bought petite love length high hope dress really wanted work initially ordered petite small usual size found outrageously small small fact co love love love jumpsuit fun flirty fabulous every time wear get nothing great compliment shirt flattering due adjustable front tie perfect length wear legging sleeveless pair well cardigan love shirt happy snag dress great price easy slip flattering cut color combo reminds maternity clothes soft stretchy shiny material cut flattering drape nicely found one button close front looked awkward nice long sleevesnnot bu fit well top see never would worked im glad able try store didnt order online different fabric would bought dress wedding summer cute unfortunately fit isnt perfect medium fit waist perfectly way long big bust sh dress lovely platinum feminine fit perfectly easy wear comfy highly recommend name length dtype object'
df["review_text"] = df["review_text"].apply(cleaning_fsa)
df["review_text"].head()
0 absolutely wonderful silky sexy comfortable 1 love dress sooo pretty happened find store im glad bc never would ordered online bc petite bought petite love length hit little knee would definitely true midi someone truly petite 2 high hope dress really wanted work initially ordered petite small usual size found outrageously small small fact could zip reordered petite medium ok overall top half comfortable fit nicely bottom... 3 love love love jumpsuit fun flirty fabulous every time wear get nothing great compliment 4 shirt flattering due adjustable front tie perfect length wear legging sleeveless pair well cardigan love shirt Name: review_text, dtype: object
df.head(3)
| review_text | recommended_ind | |
|---|---|---|
| 0 | absolutely wonderful silky sexy comfortable | 1 |
| 1 | love dress sooo pretty happened find store im glad bc never would ordered online bc petite bought petite love length hit little knee would definitely true midi someone truly petite | 1 |
| 2 | high hope dress really wanted work initially ordered petite small usual size found outrageously small small fact could zip reordered petite medium ok overall top half comfortable fit nicely bottom... | 0 |
" ".join(df["review_text"]).split()
['absolutely', 'wonderful', 'silky', 'sexy', 'comfortable', 'love', 'dress', 'sooo', 'pretty', 'happened', 'find', 'store', 'im', 'glad', 'bc', 'never', 'would', 'ordered', 'online', 'bc', 'petite', 'bought', 'petite', 'love', 'length', 'hit', 'little', 'knee', 'would', 'definitely', 'true', 'midi', 'someone', 'truly', 'petite', 'high', 'hope', 'dress', 'really', 'wanted', 'work', 'initially', 'ordered', 'petite', 'small', 'usual', 'size', 'found', 'outrageously', 'small', 'small', 'fact', 'could', 'zip', 'reordered', 'petite', 'medium', 'ok', 'overall', 'top', 'half', 'comfortable', 'fit', 'nicely', 'bottom', 'half', 'tight', 'layer', 'several', 'somewhat', 'cheap', 'net', 'layer', 'imo', 'major', 'design', 'flaw', 'net', 'layer', 'sewn', 'directly', 'zipper', 'c', 'love', 'love', 'love', 'jumpsuit', 'fun', 'flirty', 'fabulous', 'every', 'time', 'wear', 'get', 'nothing', 'great', 'compliment', 'shirt', 'flattering', 'due', 'adjustable', 'front', 'tie', 'perfect', 'length', 'wear', 'legging', 'sleeveless', 'pair', 'well', 'cardigan', 'love', 'shirt', 'love', 'tracy', 'reese', 'dress', 'one', 'petite', 'foot', 'tall', 'usually', 'wear', 'brand', 'dress', 'pretty', 'package', 'lot', 'dress', 'skirt', 'long', 'full', 'overwhelmed', 'small', 'frame', 'stranger', 'alteration', 'shortening', 'narrowing', 'skirt', 'would', 'take', 'away', 'embellishment', 'garment', 'love', 'color', 'idea', 'style', 'work', 'returned', 'dress', 'aded', 'basket', 'hte', 'last', 'mintue', 'see', 'would', 'look', 'like', 'person', 'store', 'pick', 'went', 'teh', 'darkler', 'color', 'pale', 'hte', 'color', 'really', 'gorgeous', 'turn', 'mathced', 'everythiing', 'trying', 'prefectly', 'little', 'baggy', 'hte', 'x', 'hte', 'msallet', 'size', 'bummer', 'petite', 'decided', 'jkeep', 'though', 'said', 'matvehd', 'everything', 'ejans', 'pant', 'skirt', 'waas', 'trying', 'kept', 'oops', 'ordered', 'carbon', 'store', 'pick', 'ton', 'stuff', 'always', 'try', 'used', 'top', 'pair', 'skirt', 'pant', 'everything', 'went', 'color', 'really', 'nice', 'charcoal', 'shimmer', 'went', 'well', 'pencil', 'skirt', 'flare', 'pant', 'etc', 'compaint', 'bit', 'big', 'sleeve', 'long', 'doesnt', 'go', 'petite', 'also', 'bit', 'loose', 'xx', 'kept', 'wil', 'ldecide', 'later', 'since', 'light', 'color', 'already', 'sold', 'hte', 'smallest', 'size', 'love', 'dress', 'usually', 'get', 'x', 'run', 'little', 'snug', 'bust', 'ordered', 'size', 'flattering', 'feminine', 'usual', 'retailer', 'flair', 'style', 'im', 'lb', 'ordered', 'petite', 'make', 'sure', 'length', 'wasnt', 'long', 'typically', 'wear', 'x', 'regular', 'retailer', 'dress', 'youre', 'le', 'busty', 'cup', 'smaller', 'petite', 'fit', 'perfectly', 'snug', 'tight', 'love', 'could', 'dress', 'party', 'work', 'love', 'tulle', 'longer', 'fabric', 'underneath', 'dress', 'run', 'small', 'esp', 'zipper', 'area', 'run', 'ordered', 'sp', 'typically', 'fit', 'tight', 'material', 'top', 'look', 'feel', 'cheap', 'even', 'pulling', 'cause', 'rip', 'fabric', 'pretty', 'disappointed', 'going', 'christmas', 'dress', 'year', 'needle', 'say', 'going', 'back', 'dress', 'perfection', 'pretty', 'flattering', 'find', 'reliant', 'review', 'written', 'savvy', 'shopper', 'past', 'right', 'estimation', 'product', 'case', 'dressif', 'reveiwsi', 'doubt', 'would', 'even', 'tried', 'dress', 'beautifully', 'made', 'lined', 'reminiscent', 'old', 'retailer', 'quality', 'lined', 'solid', 'periwinklecolored', 'fabric', 'match', 'outer', 'fabric', 'print', 'tt', 'formfitting', 'fall', 'knee', 'rid', 'bought', 'black', 'x', 'go', 'larkspur', 'midi', 'dress', 'didnt', 'bother', 'lining', 'skirt', 'portion', 'grrrrrrrrrrr', 'stats', 'x', 'fit', 'smoothly', 'around', 'chest', 'flowy', 'around', 'lower', 'half', 'would', 'say', 'running', 'big', 'strap', 'pretty', 'could', 'easily', 'nightwear', 'im', 'came', 'knee', 'nice', 'choice', 'holiday', 'gathering', 'like', 'length', 'graz', 'knee', 'conservative', 'enough', 'office', 'related', 'gathering', 'size', 'small', 'fit', 'well', 'usually', 'size', 'small', 'bust', 'opinion', 'run', 'small', 'larger', 'bust', 'definitely', 'size', 'perhaps', 'waist', 'big', 'problem', 'dress', 'quality', 'fabric', 'terrible', 'delicate', 'netting', 'type', 'fabric', 'top', 'layer', 'skirt', 'got', 'stuck', 'zip', 'took', 'package', 'wanted', 'fit', 'badly', 'could', 'tell', 'put', 'wouldnt', 'hourglass', 'figure', 'straight', 'waist', 'way', 'small', 'body', 'shape', 'even', 'sized', 'could', 'tell', 'would', 'still', 'tight', 'waist', 'roomy', 'hip', 'said', 'really', 'nice', 'sturdy', 'linenlike', 'fabric', 'pretty', 'color', 'well', 'made', 'hope', 'make', 'someone', 'happy', 'material', 'color', 'nice', 'leg', 'opening', 'large', 'length', 'hit', 'right', 'ankle', 'leg', 'opening', 'size', 'waist', 'hem', 'line', 'ankle', 'front', 'pleat', 'make', 'fluffy', 'think', 'imagine', 'flattering', 'look', 'least', 'average', 'height', 'taller', 'may', 'look', 'good', 'took', 'chance', 'blouse', 'glad', 'wasnt', 'crazy', 'blouse', 'photographed', 'model', 'paired', 'whit', 'white', 'pant', 'worked', 'perfectly', 'crisp', 'clean', 'would', 'describe', 'launders', 'well', 'fit', 'great', 'drape', 'perfect', 'wear', 'tucked', 'cant', 'go', 'wrong', 'flattering', 'super', 'cozy', 'coat', 'work', 'well', 'cold', 'dry', 'day', 'look', 'good', 'jean', 'dressier', 'outfit', 'small', 'fit', 'great', 'love', 'look', 'feel', 'tulle', 'dress', 'looking', 'something', 'different', 'top', 'new', 'year', 'eve', 'im', 'small', 'chested', 'top', 'dress', 'form', 'fitting', 'flattering', 'look', 'steamed', 'tulle', 'perfect', 'ordered', 'xsp', 'length', 'perfect', 'product', 'petite', 'would', 'get', 'petite', 'regular', 'little', 'long', 'tailor', 'simple', 'fix', 'fit', 'nicely', 'im', 'pregnant', 'bough', 'medium', 'grow', 'tie', 'front', 'back', 'provides', 'nice', 'flexibility', 'form', 'fitting', 'im', 'upset', 'price', 'dress', 'thought', 'embroidered', 'print', 'fabric', 'think', 'cried', 'little', 'opened', 'box', 'still', 'ver', 'pretty', 'would', 'say', 'true', 'size', 'tad', 'bit', 'big', 'tiny', 'still', 'get', 'away', 'color', 'vibrant', 'style', 'unique', 'skirt', 'portion', 'pretty', 'poofy', 'keep', 'going', 'back', 'forth', 'mainly', 'price', 'although', 'quality', 'definitely', 'except', 'wish', 'emb', 'first', 'pullover', 'styling', 'side', 'zipper', 'wouldnt', 'purchased', 'knew', 'side', 'zipper', 'large', 'bust', 'side', 'zipper', 'next', 'impossible', 'second', 'tulle', 'feel', 'look', 'cheap', 'slip', 'awkward', 'tight', 'shape', 'underneath', 'look', 'like', 'described', 'sadly', 'returning', 'im', 'sure', 'find', 'something', 'exchange', 'cute', 'little', 'dress', 'fit', 'tt', 'little', 'high', 'waisted', 'good', 'length', 'height', 'like', 'dress', 'im', 'love', 'dont', 'think', 'look', 'feel', 'cheap', 'appears', 'pictured', 'love', 'shirt', 'first', 'saw', 'wasnt', 'sure', 'shirt', 'dress', 'since', 'seethrough', 'wear', 'like', 'dress', 'need', 'slip', 'wear', 'legging', 'bought', 'slip', 'wore', 'tie', 'back', 'rocked', 'white', 'wedge', 'could', 'also', 'wear', 'vest', 'careful', 'button', 'havent', 'fall', 'yet', 'feel', 'like', 'overall', 'great', 'occasion', 'fun', 'wear', 'loved', 'material', 'didnt', 'really', 'look', 'long', 'dress', 'purchased', 'large', 'medium', 'im', 'atleast', 'material', 'foot', 'gap', 'front', 'much', 'wider', 'look', 'felt', 'like', 'dress', 'fell', 'flat', 'returned', 'im', 'usually', 'large', 'med', 'fit', 'better', 'jean', 'waiting', 'sweater', 'coat', 'ship', 'week', 'excited', 'arrive', 'coat', 'true', 'size', 'made', 'look', 'short', 'squat', 'sleeve', 'wide', 'although', 'long', 'light', 'weight', 'fall', 'coat', 'sleeve', 'dont', 'need', 'wide', 'wouldnt', 'layerng', 'much', 'underneath', 'button', 'need', 'moved', 'least', 'three', 'inch', 'nicer', 'fit', 'thought', 'redoing', 'button', 'sleeve', 'looked', 'even', 'proportion', 'tigh', 'color', 'werent', 'expected', 'either', 'dark', 'blue', 'much', 'vibrant', 'couldnt', 'find', 'anything', 'really', 'go', 'fabric', 'thick', 'good', 'quality', 'nice', 'weight', 'movement', 'skirt', 'wasnt', 'end', 'several', 'goodhyouman', 'shirt', 'get', 'many', 'compliment', 'especially', 'one', 'say', 'forehead', 'kiss', 'underrated', 'dont', 'hesitate', 'buy', 'shirt', 'wont', 'sorry', 'sweater', 'comfy', 'classic', 'balance', 'quirky', 'handknit', 'look', 'beautiful', 'color', 'practical', 'fit', 'bit', 'cropped', 'boxy', 'part', 'style', 'others', 'mentioned', 'gap', 'knit', 'make', 'seethrough', 'opinion', 'make', 'perfect', 'layering', 'like', 'longer', 'camisole', 'showing', 'underneath', 'wearing', 'little', 'dress', 'warm', 'still', 'thin', 'enough', 'fit', 'jacket', 'coat', 'beautifully', 'made', 'pant', 'trend', 'flared', 'crop', 'much', 'cuter', 'person', 'love', 'never', 'would', 'given', 'pant', 'second', 'look', 'online', 'person', 'much', 'cuter', 'stripe', 'brighter', 'fit', 'flattering', 'crop', 'cute', 'flare', 'right', 'trend', 'brand', 'always', 'run', 'small', 'carry', 'chubbiness', 'belly', 'paired', 'collarless', 'loose', 'navy', 'blazer', 'pant', 'even', 'better', 'person', 'downside', 'need', 'dry', 'cleaned', 'ordered', 'month', 'ago', 'finally', 'came', 'back', 'order', 'huge', 'disappointment', 'fit', 'much', 'issue', 'quality', ...]
word_values = pd.Series(" ".join(df["review_text"]).split()).value_counts()
word_values
dress 11319
fit 10091
size 9349
love 8968
top 8256
...
takeout 1
teenybop 1
hugely 1
shirred 1
platinum 1
Name: count, Length: 16758, dtype: int64
rare_words = word_values[word_values <= 2]
rare_words
wristforearm 2
magnolia 2
smalltight 2
urban 2
underrated 2
..
takeout 1
teenybop 1
hugely 1
shirred 1
platinum 1
Name: count, Length: 10935, dtype: int64
rare_words.value_counts()
count 1 9058 2 1877 Name: count, dtype: int64
len(rare_words)
10935
rare_words.index
Index(['wristforearm', 'magnolia', 'smalltight', 'urban', 'underrated',
'henleys', 'greenyellow', 'outrageous', 'blueteal', 'status',
...
'quaiity', 'befo', 'slam', 'brokenin', 'accomadate', 'takeout',
'teenybop', 'hugely', 'shirred', 'platinum'],
dtype='object', length=10935)
df["review_text"] = df["review_text"].apply(lambda x: " ".join([i for i in x.split() if i not in rare_words.index]))
df["review_text"].head()
0 absolutely wonderful silky sexy comfortable 1 love dress sooo pretty happened find store im glad bc never would ordered online bc petite bought petite love length hit little knee would definitely true midi someone truly petite 2 high hope dress really wanted work initially ordered petite small usual size found small small fact could zip reordered petite medium ok overall top half comfortable fit nicely bottom half tight l... 3 love love love jumpsuit fun flirty fabulous every time wear get nothing great compliment 4 shirt flattering due adjustable front tie perfect length wear legging sleeveless pair well cardigan love shirt Name: review_text, dtype: object
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 22641 entries, 0 to 23485 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 review_text 22641 non-null object 1 recommended_ind 22641 non-null int64 dtypes: int64(1), object(1) memory usage: 530.6+ KB
df.head(3)
| review_text | recommended_ind | |
|---|---|---|
| 0 | absolutely wonderful silky sexy comfortable | 1 |
| 1 | love dress sooo pretty happened find store im glad bc never would ordered online bc petite bought petite love length hit little knee would definitely true midi someone truly petite | 1 |
| 2 | high hope dress really wanted work initially ordered petite small usual size found small small fact could zip reordered petite medium ok overall top half comfortable fit nicely bottom half tight l... | 0 |
df.columns
Index(['review_text', 'recommended_ind'], dtype='object')
df[df["recommended_ind"] == 0]
| review_text | recommended_ind | |
|---|---|---|
| 2 | high hope dress really wanted work initially ordered petite small usual size found small small fact could zip reordered petite medium ok overall top half comfortable fit nicely bottom half tight l... | 0 |
| 5 | love tracy reese dress one petite foot tall usually wear brand dress pretty package lot dress skirt long full overwhelmed small frame stranger alteration shortening skirt would take away embellish... | 0 |
| 10 | dress run small esp zipper area run ordered sp typically fit tight material top look feel cheap even pulling cause rip fabric pretty disappointed going christmas dress year needle say going back | 0 |
| 22 | first pullover styling side zipper wouldnt purchased knew side zipper large bust side zipper next impossible second tulle feel look cheap slip awkward tight shape underneath look like described sa... | 0 |
| 25 | loved material didnt really look long dress purchased large medium im atleast material foot gap front much wider look felt like dress fell flat returned im usually large med fit better jean | 0 |
| ... | ... | ... |
| 23449 | tried color really pretty fun want pair pant looked awful muscular thigh dont think cut falttering also comfortable looked crotch seemed little low though pant correct size | 0 |
| 23450 | wanted love jacket soft great color unfortunately light weight almost like shirt fabric nice heft fabric though nice length accept sleeve short small right size looking something slightly warmer l... | 0 |
| 23460 | purchased good price typically love maeve winwin fabric thin slinky unfortunate way made finding appropriate undergarment difficult add slip helped figured since im losing weight would look better... | 0 |
| 23478 | surprised positive review product terrible cut weird place make look wide skirt also like picture darker heavier material isnt great return | 0 |
| 23483 | fit well top see never would worked im glad able try store didnt order online different fabric would great | 0 |
4101 rows × 2 columns
df[df["recommended_ind"] == 1]
| review_text | recommended_ind | |
|---|---|---|
| 0 | absolutely wonderful silky sexy comfortable | 1 |
| 1 | love dress sooo pretty happened find store im glad bc never would ordered online bc petite bought petite love length hit little knee would definitely true midi someone truly petite | 1 |
| 3 | love love love jumpsuit fun flirty fabulous every time wear get nothing great compliment | 1 |
| 4 | shirt flattering due adjustable front tie perfect length wear legging sleeveless pair well cardigan love shirt | 1 |
| 6 | basket hte last see would look like person store pick went teh color pale hte color really gorgeous turn trying little baggy hte x hte size bummer petite decided though said everything pant skirt ... | 1 |
| ... | ... | ... |
| 23479 | wasnt sure ordering skirt couldnt see person first im glad skirt design well made doesnt look feel cheap color isnt red worked pale skin better anticipated wore daughter dance production tall boot... | 1 |
| 23481 | happy snag dress great price easy slip flattering cut color combo | 1 |
| 23482 | reminds maternity clothes soft stretchy shiny material cut flattering drape nicely found one button close front looked awkward nice long sleeve maybe others ok | 1 |
| 23484 | bought dress wedding summer cute unfortunately fit isnt perfect medium fit waist perfectly way long big bust shoulder wanted spend money could get tailored felt like might worth side note dress de... | 1 |
| 23485 | dress lovely feminine fit perfectly easy wear comfy highly recommend | 1 |
18540 rows × 2 columns
df["recommended_ind"].value_counts()
recommended_ind 1 18540 0 4101 Name: count, dtype: int64
" ".join(df["review_text"]).split()
['absolutely', 'wonderful', 'silky', 'sexy', 'comfortable', 'love', 'dress', 'sooo', 'pretty', 'happened', 'find', 'store', 'im', 'glad', 'bc', 'never', 'would', 'ordered', 'online', 'bc', 'petite', 'bought', 'petite', 'love', 'length', 'hit', 'little', 'knee', 'would', 'definitely', 'true', 'midi', 'someone', 'truly', 'petite', 'high', 'hope', 'dress', 'really', 'wanted', 'work', 'initially', 'ordered', 'petite', 'small', 'usual', 'size', 'found', 'small', 'small', 'fact', 'could', 'zip', 'reordered', 'petite', 'medium', 'ok', 'overall', 'top', 'half', 'comfortable', 'fit', 'nicely', 'bottom', 'half', 'tight', 'layer', 'several', 'somewhat', 'cheap', 'net', 'layer', 'imo', 'major', 'design', 'flaw', 'net', 'layer', 'sewn', 'directly', 'zipper', 'c', 'love', 'love', 'love', 'jumpsuit', 'fun', 'flirty', 'fabulous', 'every', 'time', 'wear', 'get', 'nothing', 'great', 'compliment', 'shirt', 'flattering', 'due', 'adjustable', 'front', 'tie', 'perfect', 'length', 'wear', 'legging', 'sleeveless', 'pair', 'well', 'cardigan', 'love', 'shirt', 'love', 'tracy', 'reese', 'dress', 'one', 'petite', 'foot', 'tall', 'usually', 'wear', 'brand', 'dress', 'pretty', 'package', 'lot', 'dress', 'skirt', 'long', 'full', 'overwhelmed', 'small', 'frame', 'stranger', 'alteration', 'shortening', 'skirt', 'would', 'take', 'away', 'embellishment', 'garment', 'love', 'color', 'idea', 'style', 'work', 'returned', 'dress', 'basket', 'hte', 'last', 'see', 'would', 'look', 'like', 'person', 'store', 'pick', 'went', 'teh', 'color', 'pale', 'hte', 'color', 'really', 'gorgeous', 'turn', 'trying', 'little', 'baggy', 'hte', 'x', 'hte', 'size', 'bummer', 'petite', 'decided', 'though', 'said', 'everything', 'pant', 'skirt', 'trying', 'kept', 'oops', 'ordered', 'carbon', 'store', 'pick', 'ton', 'stuff', 'always', 'try', 'used', 'top', 'pair', 'skirt', 'pant', 'everything', 'went', 'color', 'really', 'nice', 'charcoal', 'shimmer', 'went', 'well', 'pencil', 'skirt', 'flare', 'pant', 'etc', 'bit', 'big', 'sleeve', 'long', 'doesnt', 'go', 'petite', 'also', 'bit', 'loose', 'xx', 'kept', 'wil', 'later', 'since', 'light', 'color', 'already', 'sold', 'hte', 'smallest', 'size', 'love', 'dress', 'usually', 'get', 'x', 'run', 'little', 'snug', 'bust', 'ordered', 'size', 'flattering', 'feminine', 'usual', 'retailer', 'flair', 'style', 'im', 'lb', 'ordered', 'petite', 'make', 'sure', 'length', 'wasnt', 'long', 'typically', 'wear', 'x', 'regular', 'retailer', 'dress', 'youre', 'le', 'busty', 'cup', 'smaller', 'petite', 'fit', 'perfectly', 'snug', 'tight', 'love', 'could', 'dress', 'party', 'work', 'love', 'tulle', 'longer', 'fabric', 'underneath', 'dress', 'run', 'small', 'esp', 'zipper', 'area', 'run', 'ordered', 'sp', 'typically', 'fit', 'tight', 'material', 'top', 'look', 'feel', 'cheap', 'even', 'pulling', 'cause', 'rip', 'fabric', 'pretty', 'disappointed', 'going', 'christmas', 'dress', 'year', 'needle', 'say', 'going', 'back', 'dress', 'perfection', 'pretty', 'flattering', 'find', 'review', 'written', 'savvy', 'shopper', 'past', 'right', 'product', 'case', 'doubt', 'would', 'even', 'tried', 'dress', 'beautifully', 'made', 'lined', 'reminiscent', 'old', 'retailer', 'quality', 'lined', 'solid', 'fabric', 'match', 'outer', 'fabric', 'print', 'tt', 'formfitting', 'fall', 'knee', 'rid', 'bought', 'black', 'x', 'go', 'midi', 'dress', 'didnt', 'bother', 'lining', 'skirt', 'portion', 'stats', 'x', 'fit', 'smoothly', 'around', 'chest', 'flowy', 'around', 'lower', 'half', 'would', 'say', 'running', 'big', 'strap', 'pretty', 'could', 'easily', 'im', 'came', 'knee', 'nice', 'choice', 'holiday', 'gathering', 'like', 'length', 'knee', 'conservative', 'enough', 'office', 'related', 'gathering', 'size', 'small', 'fit', 'well', 'usually', 'size', 'small', 'bust', 'opinion', 'run', 'small', 'larger', 'bust', 'definitely', 'size', 'perhaps', 'waist', 'big', 'problem', 'dress', 'quality', 'fabric', 'terrible', 'delicate', 'netting', 'type', 'fabric', 'top', 'layer', 'skirt', 'got', 'stuck', 'zip', 'took', 'package', 'wanted', 'fit', 'badly', 'could', 'tell', 'put', 'wouldnt', 'hourglass', 'figure', 'straight', 'waist', 'way', 'small', 'body', 'shape', 'even', 'sized', 'could', 'tell', 'would', 'still', 'tight', 'waist', 'roomy', 'hip', 'said', 'really', 'nice', 'sturdy', 'linenlike', 'fabric', 'pretty', 'color', 'well', 'made', 'hope', 'make', 'someone', 'happy', 'material', 'color', 'nice', 'leg', 'opening', 'large', 'length', 'hit', 'right', 'ankle', 'leg', 'opening', 'size', 'waist', 'hem', 'line', 'ankle', 'front', 'pleat', 'make', 'fluffy', 'think', 'imagine', 'flattering', 'look', 'least', 'average', 'height', 'taller', 'may', 'look', 'good', 'took', 'chance', 'blouse', 'glad', 'wasnt', 'crazy', 'blouse', 'photographed', 'model', 'paired', 'whit', 'white', 'pant', 'worked', 'perfectly', 'crisp', 'clean', 'would', 'describe', 'launders', 'well', 'fit', 'great', 'drape', 'perfect', 'wear', 'tucked', 'cant', 'go', 'wrong', 'flattering', 'super', 'cozy', 'coat', 'work', 'well', 'cold', 'dry', 'day', 'look', 'good', 'jean', 'dressier', 'outfit', 'small', 'fit', 'great', 'love', 'look', 'feel', 'tulle', 'dress', 'looking', 'something', 'different', 'top', 'new', 'year', 'eve', 'im', 'small', 'chested', 'top', 'dress', 'form', 'fitting', 'flattering', 'look', 'steamed', 'tulle', 'perfect', 'ordered', 'xsp', 'length', 'perfect', 'product', 'petite', 'would', 'get', 'petite', 'regular', 'little', 'long', 'tailor', 'simple', 'fix', 'fit', 'nicely', 'im', 'pregnant', 'bough', 'medium', 'grow', 'tie', 'front', 'back', 'provides', 'nice', 'flexibility', 'form', 'fitting', 'im', 'upset', 'price', 'dress', 'thought', 'embroidered', 'print', 'fabric', 'think', 'little', 'opened', 'box', 'still', 'ver', 'pretty', 'would', 'say', 'true', 'size', 'tad', 'bit', 'big', 'tiny', 'still', 'get', 'away', 'color', 'vibrant', 'style', 'unique', 'skirt', 'portion', 'pretty', 'poofy', 'keep', 'going', 'back', 'forth', 'mainly', 'price', 'although', 'quality', 'definitely', 'except', 'wish', 'first', 'pullover', 'styling', 'side', 'zipper', 'wouldnt', 'purchased', 'knew', 'side', 'zipper', 'large', 'bust', 'side', 'zipper', 'next', 'impossible', 'second', 'tulle', 'feel', 'look', 'cheap', 'slip', 'awkward', 'tight', 'shape', 'underneath', 'look', 'like', 'described', 'sadly', 'returning', 'im', 'sure', 'find', 'something', 'exchange', 'cute', 'little', 'dress', 'fit', 'tt', 'little', 'high', 'waisted', 'good', 'length', 'height', 'like', 'dress', 'im', 'love', 'dont', 'think', 'look', 'feel', 'cheap', 'appears', 'pictured', 'love', 'shirt', 'first', 'saw', 'wasnt', 'sure', 'shirt', 'dress', 'since', 'seethrough', 'wear', 'like', 'dress', 'need', 'slip', 'wear', 'legging', 'bought', 'slip', 'wore', 'tie', 'back', 'white', 'wedge', 'could', 'also', 'wear', 'vest', 'careful', 'button', 'havent', 'fall', 'yet', 'feel', 'like', 'overall', 'great', 'occasion', 'fun', 'wear', 'loved', 'material', 'didnt', 'really', 'look', 'long', 'dress', 'purchased', 'large', 'medium', 'im', 'atleast', 'material', 'foot', 'gap', 'front', 'much', 'wider', 'look', 'felt', 'like', 'dress', 'fell', 'flat', 'returned', 'im', 'usually', 'large', 'med', 'fit', 'better', 'jean', 'waiting', 'sweater', 'coat', 'ship', 'week', 'excited', 'arrive', 'coat', 'true', 'size', 'made', 'look', 'short', 'squat', 'sleeve', 'wide', 'although', 'long', 'light', 'weight', 'fall', 'coat', 'sleeve', 'dont', 'need', 'wide', 'wouldnt', 'much', 'underneath', 'button', 'need', 'moved', 'least', 'three', 'inch', 'nicer', 'fit', 'thought', 'button', 'sleeve', 'looked', 'even', 'proportion', 'tigh', 'color', 'werent', 'expected', 'either', 'dark', 'blue', 'much', 'vibrant', 'couldnt', 'find', 'anything', 'really', 'go', 'fabric', 'thick', 'good', 'quality', 'nice', 'weight', 'movement', 'skirt', 'wasnt', 'end', 'several', 'shirt', 'get', 'many', 'compliment', 'especially', 'one', 'say', 'dont', 'hesitate', 'buy', 'shirt', 'wont', 'sorry', 'sweater', 'comfy', 'classic', 'balance', 'quirky', 'handknit', 'look', 'beautiful', 'color', 'practical', 'fit', 'bit', 'cropped', 'boxy', 'part', 'style', 'others', 'mentioned', 'gap', 'knit', 'make', 'seethrough', 'opinion', 'make', 'perfect', 'layering', 'like', 'longer', 'camisole', 'showing', 'underneath', 'wearing', 'little', 'dress', 'warm', 'still', 'thin', 'enough', 'fit', 'jacket', 'coat', 'beautifully', 'made', 'pant', 'trend', 'flared', 'crop', 'much', 'cuter', 'person', 'love', 'never', 'would', 'given', 'pant', 'second', 'look', 'online', 'person', 'much', 'cuter', 'stripe', 'brighter', 'fit', 'flattering', 'crop', 'cute', 'flare', 'right', 'trend', 'brand', 'always', 'run', 'small', 'carry', 'belly', 'paired', 'loose', 'navy', 'blazer', 'pant', 'even', 'better', 'person', 'downside', 'need', 'dry', 'cleaned', 'ordered', 'month', 'ago', 'finally', 'came', 'back', 'order', 'huge', 'disappointment', 'fit', 'much', 'issue', 'quality', 'wool', 'subpar', 'someone', 'else', 'mentioned', 'guess', 'call', 'literally', 'feel', 'like', 'felt', 'super', 'thin', 'itchy', 'drape', 'well', 'feel', 'cheap', 'made', 'china', 'got', 'sale', 'still', 'worth', 'paid', 'definitely', 'going', 'back', 'neat', 'dress', 'color', 'great', 'fabric', 'super', 'soft', ...]
neg_words = " ".join(df[df["recommended_ind"] == 0].review_text).split()
neg_words
['high', 'hope', 'dress', 'really', 'wanted', 'work', 'initially', 'ordered', 'petite', 'small', 'usual', 'size', 'found', 'small', 'small', 'fact', 'could', 'zip', 'reordered', 'petite', 'medium', 'ok', 'overall', 'top', 'half', 'comfortable', 'fit', 'nicely', 'bottom', 'half', 'tight', 'layer', 'several', 'somewhat', 'cheap', 'net', 'layer', 'imo', 'major', 'design', 'flaw', 'net', 'layer', 'sewn', 'directly', 'zipper', 'c', 'love', 'tracy', 'reese', 'dress', 'one', 'petite', 'foot', 'tall', 'usually', 'wear', 'brand', 'dress', 'pretty', 'package', 'lot', 'dress', 'skirt', 'long', 'full', 'overwhelmed', 'small', 'frame', 'stranger', 'alteration', 'shortening', 'skirt', 'would', 'take', 'away', 'embellishment', 'garment', 'love', 'color', 'idea', 'style', 'work', 'returned', 'dress', 'dress', 'run', 'small', 'esp', 'zipper', 'area', 'run', 'ordered', 'sp', 'typically', 'fit', 'tight', 'material', 'top', 'look', 'feel', 'cheap', 'even', 'pulling', 'cause', 'rip', 'fabric', 'pretty', 'disappointed', 'going', 'christmas', 'dress', 'year', 'needle', 'say', 'going', 'back', 'first', 'pullover', 'styling', 'side', 'zipper', 'wouldnt', 'purchased', 'knew', 'side', 'zipper', 'large', 'bust', 'side', 'zipper', 'next', 'impossible', 'second', 'tulle', 'feel', 'look', 'cheap', 'slip', 'awkward', 'tight', 'shape', 'underneath', 'look', 'like', 'described', 'sadly', 'returning', 'im', 'sure', 'find', 'something', 'exchange', 'loved', 'material', 'didnt', 'really', 'look', 'long', 'dress', 'purchased', 'large', 'medium', 'im', 'atleast', 'material', 'foot', 'gap', 'front', 'much', 'wider', 'look', 'felt', 'like', 'dress', 'fell', 'flat', 'returned', 'im', 'usually', 'large', 'med', 'fit', 'better', 'jean', 'waiting', 'sweater', 'coat', 'ship', 'week', 'excited', 'arrive', 'coat', 'true', 'size', 'made', 'look', 'short', 'squat', 'sleeve', 'wide', 'although', 'long', 'light', 'weight', 'fall', 'coat', 'sleeve', 'dont', 'need', 'wide', 'wouldnt', 'much', 'underneath', 'button', 'need', 'moved', 'least', 'three', 'inch', 'nicer', 'fit', 'thought', 'button', 'sleeve', 'looked', 'even', 'proportion', 'tigh', 'ordered', 'month', 'ago', 'finally', 'came', 'back', 'order', 'huge', 'disappointment', 'fit', 'much', 'issue', 'quality', 'wool', 'subpar', 'someone', 'else', 'mentioned', 'guess', 'call', 'literally', 'feel', 'like', 'felt', 'super', 'thin', 'itchy', 'drape', 'well', 'feel', 'cheap', 'made', 'china', 'got', 'sale', 'still', 'worth', 'paid', 'definitely', 'going', 'back', 'pregnant', 'thought', 'would', 'great', 'sleep', 'bra', 'soft', 'fit', 'okay', 'zero', 'support', 'shape', 'would', 'buy', 'b', 'cup', 'smaller', 'get', 'away', 'without', 'support', 'would', 'seen', 'store', 'would', 'passed', 'however', 'lazy', 'return', 'wearing', 'comfortable', 'thats', 'redeeming', 'quality', 'would', 'recommend', 'larger', 'chested', 'lady', 'though', 'tank', 'fit', 'well', 'loved', 'ruffle', 'back', 'layed', 'front', 'good', 'look', 'retuning', 'tag', 'sewn', 'small', 'long', 'huge', 'x', 'itchy', 'cut', 'thread', 'left', 'behind', 'plasticy', 'even', 'itchy', 'make', 'item', 'itchy', 'tag', 'comfortable', 'also', 'love', 'bralettes', 'wear', 'time', 'including', 'work', 'b', 'cup', 'however', 'one', 'thin', 'flimsy', 'give', 'support', 'even', 'b', 'cup', 'would', 'lounging', 'bralette', 'wasnt', 'itchy', 'really', 'loved', 'top', 'online', 'wanted', 'love', 'person', 'soft', 'patter', 'okay', 'person', 'neckline', 'higher', 'used', 'also', 'two', 'button', 'back', 'must', 'unbuttoned', 'order', 'wear', 'top', 'difficult', 'button', 'behind', 'neck', 'top', 'unfortunately', 'return', 'item', 'really', 'wanted', 'work', 'ala', 'strange', 'fit', 'strap', 'would', 'stay', 'weird', 'fit', 'breast', 'worked', 'standing', 'minute', 'sat', 'fell', 'shoulder', 'fabric', 'beautiful', 'loved', 'pocket', 'designer', 'keep', 'making', 'crop', 'top', 'cant', 'imagine', 'would', 'flattering', 'anyone', 'especially', 'someone', 'average', 'height', 'well', 'endowed', 'top', 'looked', 'like', 'football', 'player', 'pattern', 'fabric', 'gorgeous', 'like', 'tall', 'super', 'tiny', 'fit', 'xx', 'may', 'work', 'fit', 'small', 'huge', 'almost', 'bigger', 'bottom', 'top', 'weird', 'cut', 'cami', 'underneath', 'ride', 'show', 'zipper', 'broke', 'piece', 'first', 'time', 'wore', 'disappointing', 'since', 'love', 'design', 'im', 'actually', 'going', 'try', 'replace', 'zipper', 'something', 'stronger', 'annoying', 'come', 'really', 'hoping', 'like', 'look', 'way', 'model', 'least', 'hem', 'much', 'pronounced', 'looser', 'one', 'photo', 'look', 'like', 'pinned', 'back', 'usually', 'wear', 'medium', 'large', 'got', 'medium', 'lot', 'material', 'bottom', 'half', 'photo', 'show', 'made', 'look', 'bigger', 'flattering', 'material', 'nice', 'weave', 'thin', 'delicate', 'bought', 'holly', 'deep', 'olive', 'blue', 'color', 'really', 'cute', 'piece', 'huge', 'ordered', 'xx', 'petite', 'unfortunately', 'extremely', 'wide', 'flattering', 'returning', 'usually', 'wear', 'medium', 'bought', 'small', 'fit', 'ok', 'shape', 'flattering', 'love', 'baby', 'doll', 'dress', 'top', 'tent', 'daughter', 'saw', 'try', 'said', 'thats', 'piece', 'tablecloth', 'going', 'back', 'excited', 'order', 'top', 'red', 'x', 'cute', 'huge', 'shapeless', 'support', 'thin', 'go', 'back', 'shouldve', 'looked', 'review', 'need', 'easy', 'comfortable', 'top', 'everyday', 'wear', 'bought', 'top', 'mostly', 'cute', 'button', 'received', 'looked', 'exactly', 'picture', 'online', 'however', 'button', 'kept', 'slipping', 'home', 'hole', 'slightly', 'big', 'shirt', 'fit', 'tad', 'snug', 'near', 'upper', 'arm', 'would', 'stretch', 'loosen', 'throughout', 'day', 'definitely', 'comfortable', 'shirt', 'felt', 'like', 'pajama', 'top', 'going', 'back', 'fabric', 'felt', 'cheap', 'didnt', 'find', 'flattering', 'top', 'reference', 'wearing', 'medium', 'photo', 'measurement', 'run', 'big', 'looked', 'unflattering', 'petite', 'might', 'work', 'someone', 'taller', 'thin', 'poor', 'quality', 'especially', 'price', 'felt', 'like', 'thin', 'pajama', 'top', 'button', 'terrible', 'little', 'shell', 'button', 'could', 'returned', 'faster', 'dress', 'quite', 'flattering', 'flirty', 'feminine', 'way', 'dress', 'received', 'new', 'color', 'faded', 'washed', 'red', 'black', 'stain', 'belt', 'area', 'tag', 'fabric', 'look', 'droopy', 'laundered', 'crisp', 'stiff', 'new', 'disappointed', 'quality', 'item', 'received', 'one', 'going', 'back', 'dear', 'retailer', 'please', 'make', 'sure', 'send', 'clothing', 'article', 'first', 'fabric', 'beautiful', 'lovely', 'spring', 'summer', 'really', 'wanted', 'like', 'top', 'fitting', 'awkward', 'typically', 'sized', 'shirt', 'size', 'tight', 'pulled', 'funny', 'across', 'chest', 'size', 'also', 'found', 'cut', 'shoulder', 'narrow', 'need', 'strapless', 'bra', 'made', 'look', 'unflattering', 'overall', 'return', 'one', 'back', 'store', 'excited', 'try', 'top', 'since', 'bargain', 'neutral', 'unfortunately', 'shape', 'line', 'accentuates', 'hip', 'area', 'bit', 'find', 'flattering', 'returned', 'tried', 'first', 'comment', 'take', 'scratchy', 'didnt', 'bother', 'trying', 'however', 'beautiful', 'sensitive', 'scratchy', 'material', 'fit', 'true', 'size', 'keeping', 'one', 'fabric', 'bit', 'person', 'cut', 'odd', 'fit', 'fine', 'snap', 'keep', 'neckline', 'flat', 'shaped', 'color', 'shown', 'good', 'length', 'fall', 'top', 'hip', 'simply', 'like', 'metallic', 'looking', 'maybe', 'disappointed', 'mention', 'suit', 'australian', 'size', 'ordered', 'im', 'usually', 'sold', 'suit', 'arrives', 'doesnt', 'even', 'fir', 'hip', 'label', 'clearly', 'say', 'australian', 'u', 'return', 'suit', 'sizing', 'issue', 'suit', 'look', 'well', 'made', 'design', 'adorable', 'title', 'say', 'fabric', 'top', 'best', 'worst', 'part', 'design', 'color', 'vibrant', 'combination', 'material', 'shoulder', 'knit', 'sweaterlike', 'navy', 'fabric', 'interesting', 'however', 'positive', 'comment', 'end', 'one', 'top', 'scratchy', 'stiff', 'frankly', 'uncomfortable', 'imagine', 'wanting', 'wear', 'could', 'benefit', 'lining', 'might', 'solved', 'problem', 'scratchy', 'itchy', 'fabric', 'stiff', 'fabric', 'bo', 'color', 'vivid', 'perfectly', 'fit', 'mess', 'overall', 'large', 'waistline', 'curve', 'front', 'fall', 'small', 'pleat', 'maternityish', 'waistband', 'thicker', 'dress', 'sat', 'away', 'body', 'material', 'poly', 'outer', 'dress', 'made', 'material', 'lining', 'would', 'liked', 'better', 'modesty', 'closure', 'plus', 'dress', 'already', 'unraveling', 'took', 'returned', 'online', 'look', 'like', 'great', 'sweater', 'ordered', 'xxsp', 'found', 'sweater', 'much', 'wider', 'middle', 'pictured', 'fact', 'im', 'pretty', 'sure', 'pinned', 'shirt', 'back', 'picture', 'make', 'appear', 'slimmer', 'unfortunately', 'sweater', 'work', 'hourglass', 'shape', 'shirt', 'make', 'look', 'pound', 'heavier', 'worried', 'item', 'ordered', 'look', 'picture', 'thinking', 'gone', 'gut', 'shirt', 'quality', 'retailer', 'purchase', 'seethrough', 'flimsy', 'bottom', 'like', 'picture', 'hang', 'odd', 'rumpled', 'way', 'top', 'flattering', 'though', 'shame', 'bottom', 'fit', 'nicer', 'like', 'product', 'could', 'gotten', 'away', 'wearing', 'cami', 'make', 'qu', 'torn', 'whether', 'return', 'ultimately', 'going', 'back', 'knit', 'thin', 'thought', 'would', 'cozy', 'normal', 'sweater', 'weight', 'light', 'swing', 'effect', 'doesnt', 'really', 'come', 'nothing', 'special', ...]
pos_words =" ".join(df[df["recommended_ind"] == 1].review_text).split()
pos_words
['absolutely', 'wonderful', 'silky', 'sexy', 'comfortable', 'love', 'dress', 'sooo', 'pretty', 'happened', 'find', 'store', 'im', 'glad', 'bc', 'never', 'would', 'ordered', 'online', 'bc', 'petite', 'bought', 'petite', 'love', 'length', 'hit', 'little', 'knee', 'would', 'definitely', 'true', 'midi', 'someone', 'truly', 'petite', 'love', 'love', 'love', 'jumpsuit', 'fun', 'flirty', 'fabulous', 'every', 'time', 'wear', 'get', 'nothing', 'great', 'compliment', 'shirt', 'flattering', 'due', 'adjustable', 'front', 'tie', 'perfect', 'length', 'wear', 'legging', 'sleeveless', 'pair', 'well', 'cardigan', 'love', 'shirt', 'basket', 'hte', 'last', 'see', 'would', 'look', 'like', 'person', 'store', 'pick', 'went', 'teh', 'color', 'pale', 'hte', 'color', 'really', 'gorgeous', 'turn', 'trying', 'little', 'baggy', 'hte', 'x', 'hte', 'size', 'bummer', 'petite', 'decided', 'though', 'said', 'everything', 'pant', 'skirt', 'trying', 'kept', 'oops', 'ordered', 'carbon', 'store', 'pick', 'ton', 'stuff', 'always', 'try', 'used', 'top', 'pair', 'skirt', 'pant', 'everything', 'went', 'color', 'really', 'nice', 'charcoal', 'shimmer', 'went', 'well', 'pencil', 'skirt', 'flare', 'pant', 'etc', 'bit', 'big', 'sleeve', 'long', 'doesnt', 'go', 'petite', 'also', 'bit', 'loose', 'xx', 'kept', 'wil', 'later', 'since', 'light', 'color', 'already', 'sold', 'hte', 'smallest', 'size', 'love', 'dress', 'usually', 'get', 'x', 'run', 'little', 'snug', 'bust', 'ordered', 'size', 'flattering', 'feminine', 'usual', 'retailer', 'flair', 'style', 'im', 'lb', 'ordered', 'petite', 'make', 'sure', 'length', 'wasnt', 'long', 'typically', 'wear', 'x', 'regular', 'retailer', 'dress', 'youre', 'le', 'busty', 'cup', 'smaller', 'petite', 'fit', 'perfectly', 'snug', 'tight', 'love', 'could', 'dress', 'party', 'work', 'love', 'tulle', 'longer', 'fabric', 'underneath', 'dress', 'perfection', 'pretty', 'flattering', 'find', 'review', 'written', 'savvy', 'shopper', 'past', 'right', 'product', 'case', 'doubt', 'would', 'even', 'tried', 'dress', 'beautifully', 'made', 'lined', 'reminiscent', 'old', 'retailer', 'quality', 'lined', 'solid', 'fabric', 'match', 'outer', 'fabric', 'print', 'tt', 'formfitting', 'fall', 'knee', 'rid', 'bought', 'black', 'x', 'go', 'midi', 'dress', 'didnt', 'bother', 'lining', 'skirt', 'portion', 'stats', 'x', 'fit', 'smoothly', 'around', 'chest', 'flowy', 'around', 'lower', 'half', 'would', 'say', 'running', 'big', 'strap', 'pretty', 'could', 'easily', 'im', 'came', 'knee', 'nice', 'choice', 'holiday', 'gathering', 'like', 'length', 'knee', 'conservative', 'enough', 'office', 'related', 'gathering', 'size', 'small', 'fit', 'well', 'usually', 'size', 'small', 'bust', 'opinion', 'run', 'small', 'larger', 'bust', 'definitely', 'size', 'perhaps', 'waist', 'big', 'problem', 'dress', 'quality', 'fabric', 'terrible', 'delicate', 'netting', 'type', 'fabric', 'top', 'layer', 'skirt', 'got', 'stuck', 'zip', 'took', 'package', 'wanted', 'fit', 'badly', 'could', 'tell', 'put', 'wouldnt', 'hourglass', 'figure', 'straight', 'waist', 'way', 'small', 'body', 'shape', 'even', 'sized', 'could', 'tell', 'would', 'still', 'tight', 'waist', 'roomy', 'hip', 'said', 'really', 'nice', 'sturdy', 'linenlike', 'fabric', 'pretty', 'color', 'well', 'made', 'hope', 'make', 'someone', 'happy', 'material', 'color', 'nice', 'leg', 'opening', 'large', 'length', 'hit', 'right', 'ankle', 'leg', 'opening', 'size', 'waist', 'hem', 'line', 'ankle', 'front', 'pleat', 'make', 'fluffy', 'think', 'imagine', 'flattering', 'look', 'least', 'average', 'height', 'taller', 'may', 'look', 'good', 'took', 'chance', 'blouse', 'glad', 'wasnt', 'crazy', 'blouse', 'photographed', 'model', 'paired', 'whit', 'white', 'pant', 'worked', 'perfectly', 'crisp', 'clean', 'would', 'describe', 'launders', 'well', 'fit', 'great', 'drape', 'perfect', 'wear', 'tucked', 'cant', 'go', 'wrong', 'flattering', 'super', 'cozy', 'coat', 'work', 'well', 'cold', 'dry', 'day', 'look', 'good', 'jean', 'dressier', 'outfit', 'small', 'fit', 'great', 'love', 'look', 'feel', 'tulle', 'dress', 'looking', 'something', 'different', 'top', 'new', 'year', 'eve', 'im', 'small', 'chested', 'top', 'dress', 'form', 'fitting', 'flattering', 'look', 'steamed', 'tulle', 'perfect', 'ordered', 'xsp', 'length', 'perfect', 'product', 'petite', 'would', 'get', 'petite', 'regular', 'little', 'long', 'tailor', 'simple', 'fix', 'fit', 'nicely', 'im', 'pregnant', 'bough', 'medium', 'grow', 'tie', 'front', 'back', 'provides', 'nice', 'flexibility', 'form', 'fitting', 'im', 'upset', 'price', 'dress', 'thought', 'embroidered', 'print', 'fabric', 'think', 'little', 'opened', 'box', 'still', 'ver', 'pretty', 'would', 'say', 'true', 'size', 'tad', 'bit', 'big', 'tiny', 'still', 'get', 'away', 'color', 'vibrant', 'style', 'unique', 'skirt', 'portion', 'pretty', 'poofy', 'keep', 'going', 'back', 'forth', 'mainly', 'price', 'although', 'quality', 'definitely', 'except', 'wish', 'cute', 'little', 'dress', 'fit', 'tt', 'little', 'high', 'waisted', 'good', 'length', 'height', 'like', 'dress', 'im', 'love', 'dont', 'think', 'look', 'feel', 'cheap', 'appears', 'pictured', 'love', 'shirt', 'first', 'saw', 'wasnt', 'sure', 'shirt', 'dress', 'since', 'seethrough', 'wear', 'like', 'dress', 'need', 'slip', 'wear', 'legging', 'bought', 'slip', 'wore', 'tie', 'back', 'white', 'wedge', 'could', 'also', 'wear', 'vest', 'careful', 'button', 'havent', 'fall', 'yet', 'feel', 'like', 'overall', 'great', 'occasion', 'fun', 'wear', 'color', 'werent', 'expected', 'either', 'dark', 'blue', 'much', 'vibrant', 'couldnt', 'find', 'anything', 'really', 'go', 'fabric', 'thick', 'good', 'quality', 'nice', 'weight', 'movement', 'skirt', 'wasnt', 'end', 'several', 'shirt', 'get', 'many', 'compliment', 'especially', 'one', 'say', 'dont', 'hesitate', 'buy', 'shirt', 'wont', 'sorry', 'sweater', 'comfy', 'classic', 'balance', 'quirky', 'handknit', 'look', 'beautiful', 'color', 'practical', 'fit', 'bit', 'cropped', 'boxy', 'part', 'style', 'others', 'mentioned', 'gap', 'knit', 'make', 'seethrough', 'opinion', 'make', 'perfect', 'layering', 'like', 'longer', 'camisole', 'showing', 'underneath', 'wearing', 'little', 'dress', 'warm', 'still', 'thin', 'enough', 'fit', 'jacket', 'coat', 'beautifully', 'made', 'pant', 'trend', 'flared', 'crop', 'much', 'cuter', 'person', 'love', 'never', 'would', 'given', 'pant', 'second', 'look', 'online', 'person', 'much', 'cuter', 'stripe', 'brighter', 'fit', 'flattering', 'crop', 'cute', 'flare', 'right', 'trend', 'brand', 'always', 'run', 'small', 'carry', 'belly', 'paired', 'loose', 'navy', 'blazer', 'pant', 'even', 'better', 'person', 'downside', 'need', 'dry', 'cleaned', 'neat', 'dress', 'color', 'great', 'fabric', 'super', 'soft', 'tall', 'long', 'length', 'added', 'bonus', 'definitely', 'need', 'something', 'underneath', 'since', 'front', 'gap', 'going', 'pair', 'funky', 'tank', 'top', 'necklace', 'boot', 'super', 'cute', 'wouldnt', 'given', 'second', 'look', 'tried', 'store', 'whim', 'love', 'love', 'comfortable', 'skirt', 'span', 'season', 'easily', 'exciting', 'design', 'good', 'work', 'skirt', 'paired', 'many', 'top', 'ordered', 'small', 'size', 'medium', 'mom', 'size', 'gorgeous', 'beautifully', 'draped', 'ill', 'need', 'houston', 'fall', 'winter', 'look', 'polished', 'snapped', 'ageappropriate', 'mom', 'look', 'amazing', 'skinny', 'jean', 'legging', 'ordered', 'gray', 'true', 'photo', 'super', 'cute', 'comfy', 'pull', 'sizing', 'accurate', 'material', 'little', 'bit', 'stretch', 'great', 'casual', 'top', 'flare', 'look', 'cute', 'grey', 'pilcro', 'stet', 'jean', 'flattering', 'peplum', 'back', 'nice', 'cut', 'shoulder', 'neckline', 'pretty', 'unique', 'great', 'jean', 'worn', 'work', 'slack', 'heel', 'color', 'print', 'embroidery', 'lovely', 'reasonably', 'priced', 'beautiful', 'top', 'unique', 'ordinary', 'bought', 'usual', 'medium', 'found', 'fit', 'tight', 'across', 'chest', 'although', 'baby', 'year', 'nursing', 'could', 'bought', 'would', 'size', 'poncho', 'cute', 'love', 'plaid', 'check', 'design', 'color', 'look', 'like', 'sorbet', 'cream', 'pair', 'well', 'turtleneck', 'jean', 'pencil', 'skirt', 'heel', 'love', 'look', 'fall', 'roll', 'right', 'spring', 'great', 'buy', 'first', 'thermal', 'naturally', 'didnt', 'expect', 'super', 'sheer', 'really', 'sheer', 'light', 'fabric', 'like', 'prepared', 'considering', 'might', 'run', 'walk', 'around', 'house', 'second', 'large', 'ordered', 'size', 'nice', 'oversized', 'fit', 'pound', 'wouldnt', 'want', 'go', 'larger', 'along', 'sheer', 'fabric', 'easily', 'stretched', 'dont', 'mind', 'case', 'finally', 'color', 'look', 'white', 'monitor', 'show', 'colorful', 'blue', 'dot', 'tried', 'today', 'local', 'retailer', 'comfortable', 'flattering', 'bad', 'picture', 'online', 'model', 'tucking', 'skirt', 'cant', 'see', 'ruching', 'across', 'front', 'little', 'dressier', 'alternative', 'plain', 'tee', 'reasonably', 'priced', 'retailer', 'generally', 'wear', 'small', 'fit', 'well', 'probably', 'back', 'black', 'bought', 'item', 'online', 'fit', 'model', 'looked', 'little', 'loose', 'got', 'mine', 'seemed', 'bit', 'tight', 'took', 'back', 'store', 'ordered', 'larger', 'size', 'sale', 'price', 'great', 'top', 'love', 'top', 'wear', 'time', 'problem', 'tell', 'wear', 'time', ...]
review_text = df["review_text"]
all_words = " ".join(review_text)
all_words[:100]
'absolutely wonderful silky sexy comfortable love dress sooo pretty happened find store im glad bc ne'
from wordcloud import WordCloud
wordcloud = WordCloud(background_color="white", max_words =250).generate(all_words)
plt.figure(figsize = (13, 13))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
wordcloud = WordCloud(background_color="white", max_words =250, colormap='gist_heat').generate(str(neg_words))
plt.figure(figsize = (13, 13))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
wordcloud = WordCloud(background_color="white", max_words =250, colormap='cool').generate(str(pos_words))
plt.figure(figsize = (13, 13))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
df.head()
| review_text | recommended_ind | |
|---|---|---|
| 0 | absolutely wonderful silky sexy comfortable | 1 |
| 1 | love dress sooo pretty happened find store im glad bc never would ordered online bc petite bought petite love length hit little knee would definitely true midi someone truly petite | 1 |
| 2 | high hope dress really wanted work initially ordered petite small usual size found small small fact could zip reordered petite medium ok overall top half comfortable fit nicely bottom half tight l... | 0 |
| 3 | love love love jumpsuit fun flirty fabulous every time wear get nothing great compliment | 1 |
| 4 | shirt flattering due adjustable front tie perfect length wear legging sleeveless pair well cardigan love shirt | 1 |
from sklearn.model_selection import train_test_split
X = df["review_text"]
y= df["recommended_ind"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=101)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X_train_count = vectorizer.fit_transform(X_train)
X_test_count = vectorizer.transform(X_test)
type(X_train_count)
scipy.sparse._csr.csr_matrix
X_train_count.toarray()
array([[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
...,
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]], dtype=int64)
vectorizer.get_feature_names_out()
array(['aa', 'ab', 'abby', ..., 'zipping', 'zone', 'zoom'], dtype=object)
pd.DataFrame(X_train_count.toarray(), columns = vectorizer.get_feature_names_out())
| aa | ab | abby | abdomen | ability | able | abo | abovetheknee | absolute | absolutely | absolutley | abstract | absurd | abt | abundance | ac | accent | accented | accentuate | accentuated | accentuates | accentuating | accept | acceptable | access | accessorize | accessorized | accessorizing | accessory | accident | accidental | accidentally | accommodate | accommodates | accommodating | accomodate | according | accordingly | account | accurate | accurately | achieve | across | acrylic | act | action | active | activewear | activity | actual | actuality | actually | ad | ada | add | added | addicted | adding | addition | additional | additionally | address | adequate | adequately | adjust | adjustable | adjusted | adjusting | adjustment | admire | admired | admiring | admit | admittedly | adn | ador | adorable | adore | adored | adult | advantage | advertised | advice | advise | advised | aesthetic | aesthetically | affair | affect | afford | affordable | afraid | afternoon | afterward | afterwards | ag | age | ageappropriate | aged | ago | ... | word | wore | work | workable | workappropriate | worked | working | workmanship | workout | workplace | workthe | world | worn | worried | worry | worrying | worse | worst | worth | worthwhile | worthy | wou | woul | would | wouldnt | wouldve | wound | woven | wow | wowed | wrap | wrapped | wrapping | wri | wring | wrinkle | wrinkled | wrinkling | wrinkly | wrist | write | writing | written | wrong | wrote | xl | xlarge | xmas | xsi | xsmall | xsmallsmall | xsp | xspetite | xssm | xtra | xx | xxl | xxsmall | xxsp | yarn | yay | yeah | year | yearold | yearround | yellow | yellowish | yelloworange | yellowy | yep | yes | yesterday | yet | yikes | yo | yoga | yoke | york | youd | youll | young | younger | youre | youthful | youve | yr | yuck | yucky | yummy | zag | zero | zig | zigzag | zip | zipped | zipper | zippered | zipping | zone | zoom | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18107 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 18108 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 18109 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 18110 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 18111 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
18112 rows × 5791 columns
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf_vectorizer = TfidfVectorizer()
X_train_tf_idf = tf_idf_vectorizer.fit_transform(X_train)
X_test_tf_idf = tf_idf_vectorizer.transform(X_test)
X_train_tf_idf.toarray()
array([[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
...,
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]])
pd.DataFrame(X_train_tf_idf.toarray(), columns = tf_idf_vectorizer.get_feature_names_out())
| aa | ab | abby | abdomen | ability | able | abo | abovetheknee | absolute | absolutely | absolutley | abstract | absurd | abt | abundance | ac | accent | accented | accentuate | accentuated | accentuates | accentuating | accept | acceptable | access | accessorize | accessorized | accessorizing | accessory | accident | accidental | accidentally | accommodate | accommodates | accommodating | accomodate | according | accordingly | account | accurate | accurately | achieve | across | acrylic | act | action | active | activewear | activity | actual | actuality | actually | ad | ada | add | added | addicted | adding | addition | additional | additionally | address | adequate | adequately | adjust | adjustable | adjusted | adjusting | adjustment | admire | admired | admiring | admit | admittedly | adn | ador | adorable | adore | adored | adult | advantage | advertised | advice | advise | advised | aesthetic | aesthetically | affair | affect | afford | affordable | afraid | afternoon | afterward | afterwards | ag | age | ageappropriate | aged | ago | ... | word | wore | work | workable | workappropriate | worked | working | workmanship | workout | workplace | workthe | world | worn | worried | worry | worrying | worse | worst | worth | worthwhile | worthy | wou | woul | would | wouldnt | wouldve | wound | woven | wow | wowed | wrap | wrapped | wrapping | wri | wring | wrinkle | wrinkled | wrinkling | wrinkly | wrist | write | writing | written | wrong | wrote | xl | xlarge | xmas | xsi | xsmall | xsmallsmall | xsp | xspetite | xssm | xtra | xx | xxl | xxsmall | xxsp | yarn | yay | yeah | year | yearold | yearround | yellow | yellowish | yelloworange | yellowy | yep | yes | yesterday | yet | yikes | yo | yoga | yoke | york | youd | youll | young | younger | youre | youthful | youve | yr | yuck | yucky | yummy | zag | zero | zig | zigzag | zip | zipped | zipper | zippered | zipping | zone | zoom | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 2 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 3 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 4 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18107 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.18 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 18108 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.11 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 18109 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.29 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0.00 | 0.00 | 0.20 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 18110 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 18111 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.23 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
18112 rows × 5791 columns
from sklearn.metrics import confusion_matrix,classification_report, f1_score, recall_score, accuracy_score, precision_score
def eval(model, X_train, X_test):
y_pred = model.predict(X_test)
y_pred_train = model.predict(X_train)
print("Test_Set")
print(classification_report(y_test, y_pred))
print("Train_Set")
print(classification_report(y_train, y_pred_train))
fig, ax = plt.subplots(figsize=(8, 8))
plot_confusion_matrix(model, X_test, y_test, ax=ax)
from sklearn.linear_model import LogisticRegression
log = LogisticRegression(C =0.6, max_iter=1000, class_weight= "balanced", random_state=101)
log.fit(X_train_count,y_train)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[1], line 4 1 from sklearn.linear_model import LogisticRegression 3 log = LogisticRegression(C =0.6, max_iter=1000, class_weight= "balanced", random_state=101) ----> 4 log.fit(X_train_count,y_train) NameError: name 'X_train_count' is not defined
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
custom_scorer = {'accuracy': make_scorer(accuracy_score),
'precision-0': make_scorer(precision_score, pos_label=0),
'recall-0': make_scorer(recall_score, pos_label=0),
'f1-0': make_scorer(f1_score, pos_label=0),
'precision-1': make_scorer(precision_score, pos_label=1),
'recall-1': make_scorer(recall_score, pos_label=1),
'f1-1': make_scorer(f1_score, pos_label=1)
}
for i, j in custom_scorer.items():
model = LogisticRegression(C =0.6, max_iter=1000, class_weight= "balanced", random_state=101)
scores = cross_val_score(model, X_train_count, y_train, cv = 10, scoring = j).mean()
if i == "recall-1":
log_count_rec = scores
elif i == "f1-1":
log_count_f1 = scores
print(f" {i:20} score for count : {scores}\n")
accuracy score for count : 0.8663865231239555 precision-0 score for count : 0.6004732142128997 recall-0 score for count : 0.7860432574690488 f1-0 score for count : 0.6806600348422279 precision-1 score for count : 0.9492144073093828 recall-1 score for count : 0.8841613760989325 f1-1 score for count : 0.9155098938458213
from yellowbrick.classifier import PrecisionRecallCurve
viz = PrecisionRecallCurve(
LogisticRegression(C =0.6, max_iter=1000, class_weight= "balanced", random_state=101),
classes=log.classes_,
per_class=True,
cmap="Set1"
)
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_facecolor('yellow')
viz.fit(X_train_count,y_train)
viz.score(X_test_count, y_test)
viz.show();
log_AP_count = viz.score_
log = LogisticRegression(C=0.1, max_iter=1000, random_state=101, class_weight="balanced")
log.fit(X_train_tf_idf,y_train)
LogisticRegression(C=0.1, class_weight='balanced', max_iter=1000,
random_state=101)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. LogisticRegression(C=0.1, class_weight='balanced', max_iter=1000,
random_state=101)print("LOG MODEL")
eval(log, X_train_tf_idf, X_test_tf_idf)
LOG MODEL
Test_Set
precision recall f1-score support
0 0.55 0.86 0.67 820
1 0.97 0.85 0.90 3709
accuracy 0.85 4529
macro avg 0.76 0.85 0.79 4529
weighted avg 0.89 0.85 0.86 4529
Train_Set
precision recall f1-score support
0 0.57 0.89 0.69 3281
1 0.97 0.85 0.91 14831
accuracy 0.86 18112
macro avg 0.77 0.87 0.80 18112
weighted avg 0.90 0.86 0.87 18112
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[137], line 3 1 print("LOG MODEL") ----> 3 eval(log, X_train_tf_idf, X_test_tf_idf) Cell In[130], line 11, in eval(model, X_train, X_test) 8 print(classification_report(y_train, y_pred_train)) 9 fig, ax = plt.subplots(figsize=(8, 8)) ---> 11 plot_confusion_matrix(model, X_test, y_test, ax=ax) NameError: name 'plot_confusion_matrix' is not defined
custom_scorer = {'accuracy': make_scorer(accuracy_score),
'precision-0': make_scorer(precision_score, pos_label=0),
'recall-0': make_scorer(recall_score, pos_label=0),
'f1-0': make_scorer(f1_score, pos_label=0),
'precision-1': make_scorer(precision_score, pos_label=1),
'recall-1': make_scorer(recall_score, pos_label=1),
'f1-1': make_scorer(f1_score, pos_label=1)
}
for i, j in custom_scorer.items():
LogisticRegression(C=0.1, max_iter=1000, random_state=101, class_weight="balanced")
scores = cross_val_score(model, X_train_tf_idf, y_train, cv = 10, scoring = j).mean()
if i == "recall-1":
log_tfidf_rec = scores
elif i == "f1-1":
log_tfidf_f1 = scores
print(f" {i:20} score for tfidf : {scores}\n")
accuracy score for tfidf : 0.860257465110808 precision-0 score for tfidf : 0.5784925099903739 recall-0 score for tfidf : 0.8463887241456002 f1-0 score for tfidf : 0.6870476202382443 precision-1 score for tfidf : 0.9621600784762144 recall-1 score for tfidf : 0.8633260510402712 f1-1 score for tfidf : 0.9100295215782153
viz = PrecisionRecallCurve(
LogisticRegression(C=0.1, max_iter=1000, random_state=101, class_weight="balanced"),
classes=log.classes_,
per_class=True,
cmap="Set1"
)
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_facecolor('yellow')
viz.fit(X_train_tf_idf,y_train)
viz.score(X_test_tf_idf, y_test)
viz.show();
log_AP_tfidf = viz.score_
from sklearn.naive_bayes import MultinomialNB, BernoulliNB # BernoulliNB for binary model
nb = MultinomialNB()
nb.fit(X_train_count, y_train)
MultinomialNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
MultinomialNB()
print("NB MODEL")
eval(nb, X_train_count, X_test_count)
NB MODEL
Test_Set
precision recall f1-score support
0 0.67 0.74 0.70 820
1 0.94 0.92 0.93 3709
accuracy 0.89 4529
macro avg 0.80 0.83 0.82 4529
weighted avg 0.89 0.89 0.89 4529
Train_Set
precision recall f1-score support
0 0.70 0.81 0.75 3281
1 0.96 0.92 0.94 14831
accuracy 0.90 18112
macro avg 0.83 0.87 0.85 18112
weighted avg 0.91 0.90 0.91 18112
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[143], line 3 1 print("NB MODEL") ----> 3 eval(nb, X_train_count, X_test_count) Cell In[130], line 11, in eval(model, X_train, X_test) 8 print(classification_report(y_train, y_pred_train)) 9 fig, ax = plt.subplots(figsize=(8, 8)) ---> 11 plot_confusion_matrix(model, X_test, y_test, ax=ax) NameError: name 'plot_confusion_matrix' is not defined
custom_scorer = {'accuracy': make_scorer(accuracy_score),
'precision-0': make_scorer(precision_score, pos_label=0),
'recall-0': make_scorer(recall_score, pos_label=0),
'f1-0': make_scorer(f1_score, pos_label=0),
'precision-1': make_scorer(precision_score, pos_label=1),
'recall-1': make_scorer(recall_score, pos_label=1),
'f1-1': make_scorer(f1_score, pos_label=1)
}
for i, j in custom_scorer.items():
model = MultinomialNB()
scores = cross_val_score(model, X_train_count, y_train, cv = 10, scoring = j).mean()
if i == "recall-1":
nb_count_rec = scores
elif i == "f1-1":
nb_count_f1 = scores
print(f" {i:20} score for count : {scores}\n")
accuracy score for count : 0.8826189413968841 precision-0 score for count : 0.654342059232186 recall-0 score for count : 0.7476425235376973 f1-0 score for count : 0.6975979898377522 precision-1 score for count : 0.9423871525871338 recall-1 score for count : 0.9124805295596273 f1-1 score for count : 0.927165611564892
viz = PrecisionRecallCurve(
MultinomialNB(),
classes=nb.classes_,
per_class=True,
cmap="Set1"
)
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_facecolor('yellow')
viz.fit(X_train_count,y_train)
viz.score(X_test_count, y_test)
viz.show();
nb_AP_count = viz.score_
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
nb = MultinomialNB()
nb.fit(X_train_tf_idf, y_train)
MultinomialNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
MultinomialNB()
print("NB MODEL")
eval(nb, X_train_tf_idf, X_test_tf_idf)
NB MODEL
Test_Set
precision recall f1-score support
0 0.87 0.17 0.28 820
1 0.84 0.99 0.91 3709
accuracy 0.84 4529
macro avg 0.85 0.58 0.60 4529
weighted avg 0.85 0.84 0.80 4529
Train_Set
precision recall f1-score support
0 0.95 0.22 0.36 3281
1 0.85 1.00 0.92 14831
accuracy 0.86 18112
macro avg 0.90 0.61 0.64 18112
weighted avg 0.87 0.86 0.82 18112
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[148], line 2 1 print("NB MODEL") ----> 2 eval(nb, X_train_tf_idf, X_test_tf_idf) Cell In[130], line 11, in eval(model, X_train, X_test) 8 print(classification_report(y_train, y_pred_train)) 9 fig, ax = plt.subplots(figsize=(8, 8)) ---> 11 plot_confusion_matrix(model, X_test, y_test, ax=ax) NameError: name 'plot_confusion_matrix' is not defined
custom_scorer = {'accuracy': make_scorer(accuracy_score),
'precision-0': make_scorer(precision_score, pos_label=0),
'recall-0': make_scorer(recall_score, pos_label=0),
'f1-0': make_scorer(f1_score, pos_label=0),
'precision-1': make_scorer(precision_score, pos_label=1),
'recall-1': make_scorer(recall_score, pos_label=1),
'f1-1': make_scorer(f1_score, pos_label=1)
}
for i, j in custom_scorer.items():
model = BernoulliNB()
scores = cross_val_score(model, X_train_tf_idf, y_train, cv = 10, scoring = j).mean()
if i == "recall-1":
nb_tfidf_rec = scores
elif i == "f1-1":
nb_tfidf_f1 = scores
print(f" {i:20} score for tfidf : {scores}\n")
accuracy score for tfidf : 0.8766563300312171 precision-0 score for tfidf : 0.6434549409966672 recall-0 score for tfidf : 0.7168618874638594 f1-0 score for tfidf : 0.6778544585841554 precision-1 score for tfidf : 0.9357830256315294 recall-1 score for tfidf : 0.9120087405692183 f1-1 score for tfidf : 0.9237148436630017
from yellowbrick.classifier import PrecisionRecallCurve
viz = PrecisionRecallCurve(
MultinomialNB(),
classes=nb.classes_,
per_class=True,
cmap="Set1"
)
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_facecolor('yellow')
viz.fit(X_train_tf_idf, y_train)
viz.score(X_test_tf_idf, y_test)
viz.show();
nb_AP_tfidf = viz.score_
from sklearn.svm import LinearSVC
svc = LinearSVC(C=0.01, class_weight="balanced", random_state=101)
svc.fit(X_train_count,y_train)
LinearSVC(C=0.01, class_weight='balanced', random_state=101)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LinearSVC(C=0.01, class_weight='balanced', random_state=101)
print("SVC MODEL")
eval(svc, X_train_count, X_test_count)
SVC MODEL
Test_Set
precision recall f1-score support
0 0.60 0.84 0.70 820
1 0.96 0.87 0.92 3709
accuracy 0.87 4529
macro avg 0.78 0.86 0.81 4529
weighted avg 0.90 0.87 0.88 4529
Train_Set
precision recall f1-score support
0 0.65 0.92 0.76 3281
1 0.98 0.89 0.93 14831
accuracy 0.90 18112
macro avg 0.82 0.91 0.85 18112
weighted avg 0.92 0.90 0.90 18112
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[153], line 3 1 print("SVC MODEL") ----> 3 eval(svc, X_train_count, X_test_count) Cell In[130], line 11, in eval(model, X_train, X_test) 8 print(classification_report(y_train, y_pred_train)) 9 fig, ax = plt.subplots(figsize=(8, 8)) ---> 11 plot_confusion_matrix(model, X_test, y_test, ax=ax) NameError: name 'plot_confusion_matrix' is not defined
custom_scorer = {'accuracy': make_scorer(accuracy_score),
'precision-0': make_scorer(precision_score, pos_label=0),
'recall-0': make_scorer(recall_score, pos_label=0),
'f1-0': make_scorer(f1_score, pos_label=0),
'precision-1': make_scorer(precision_score, pos_label=1),
'recall-1': make_scorer(recall_score, pos_label=1),
'f1-1': make_scorer(f1_score, pos_label=1)
}
for i, j in custom_scorer.items():
model = LinearSVC(C=0.01, class_weight="balanced", random_state=101)
scores = cross_val_score(model, X_train_count, y_train, cv = 10, scoring = j).mean()
if i == "recall-1":
svc_count_rec = scores
elif i == "f1-1":
svc_count_f1 = scores
print(f" {i:20} score for count : {scores}\n")
accuracy score for count : 0.8649509741181862 precision-0 score for count : 0.590799663472119 recall-0 score for count : 0.8305396990140114 f1-0 score for count : 0.6902787096697695 precision-1 score for count : 0.9588319718226306 recall-1 score for count : 0.872563900304075 f1-1 score for count : 0.91363862996587
viz = PrecisionRecallCurve(
LinearSVC(C=0.01, class_weight="balanced", random_state=101),
classes=svc.classes_,
per_class=True,
cmap="Set1"
)
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_facecolor('yellow')
viz.fit(X_train_count,y_train)
viz.score(X_test_count, y_test)
viz.show();
svc_AP_count = viz.score_
svc = LinearSVC(C=0.01, class_weight="balanced", random_state=101)
svc.fit(X_train_tf_idf, y_train)
LinearSVC(C=0.01, class_weight='balanced', random_state=101)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LinearSVC(C=0.01, class_weight='balanced', random_state=101)
print("SVC MODEL")
eval(svc, X_train_tf_idf, X_test_tf_idf)
SVC MODEL
Test_Set
precision recall f1-score support
0 0.55 0.87 0.67 820
1 0.97 0.84 0.90 3709
accuracy 0.85 4529
macro avg 0.76 0.86 0.79 4529
weighted avg 0.89 0.85 0.86 4529
Train_Set
precision recall f1-score support
0 0.57 0.89 0.69 3281
1 0.97 0.85 0.91 14831
accuracy 0.86 18112
macro avg 0.77 0.87 0.80 18112
weighted avg 0.90 0.86 0.87 18112
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[158], line 3 1 print("SVC MODEL") ----> 3 eval(svc, X_train_tf_idf, X_test_tf_idf) Cell In[130], line 11, in eval(model, X_train, X_test) 8 print(classification_report(y_train, y_pred_train)) 9 fig, ax = plt.subplots(figsize=(8, 8)) ---> 11 plot_confusion_matrix(model, X_test, y_test, ax=ax) NameError: name 'plot_confusion_matrix' is not defined
custom_scorer = {'accuracy': make_scorer(accuracy_score),
'precision-0': make_scorer(precision_score, pos_label=0),
'recall-0': make_scorer(recall_score, pos_label=0),
'f1-0': make_scorer(f1_score, pos_label=0),
'precision-1': make_scorer(precision_score, pos_label=1),
'recall-1': make_scorer(recall_score, pos_label=1),
'f1-1': make_scorer(f1_score, pos_label=1)
}
for i, j in custom_scorer.items():
model = LinearSVC(C=0.01, class_weight="balanced", random_state=101)
scores = cross_val_score(model, X_train_tf_idf, y_train, cv = 10, scoring = j).mean()
if i == "recall-1":
svc_tfidf_rec = scores
elif i == "f1-1":
svc_tfidf_f1 = scores
print(f" {i:20} score for tfidf : {scores}\n")
accuracy score for tfidf : 0.8430870703074053 precision-0 score for tfidf : 0.5428195619628373 recall-0 score for tfidf : 0.8576673585884796 f1-0 score for tfidf : 0.6646097767647368 precision-1 score for tfidf : 0.9639062389709894 recall-1 score for tfidf : 0.8398624664435934 f1-1 score for tfidf : 0.8975617321619218
viz = PrecisionRecallCurve(
LinearSVC(C=0.01, class_weight="balanced", random_state=101),
classes=svc.classes_,
per_class=True,
cmap="Set1"
)
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_facecolor('yellow')
viz.fit(X_train_tf_idf,y_train)
viz.score(X_test_tf_idf, y_test)
viz.show();
svc_AP_tfidf = viz.score_
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(200, max_depth = 12, random_state = 42, n_jobs = -1, class_weight="balanced")
rf.fit(X_train_count, y_train)
RandomForestClassifier(class_weight='balanced', max_depth=12, n_estimators=200,
n_jobs=-1, random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. RandomForestClassifier(class_weight='balanced', max_depth=12, n_estimators=200,
n_jobs=-1, random_state=42)print("RF MODEL")
eval(rf, X_train_count, X_test_count)
RF MODEL
Test_Set
precision recall f1-score support
0 0.57 0.78 0.66 820
1 0.95 0.87 0.91 3709
accuracy 0.85 4529
macro avg 0.76 0.83 0.78 4529
weighted avg 0.88 0.85 0.86 4529
Train_Set
precision recall f1-score support
0 0.63 0.90 0.74 3281
1 0.97 0.88 0.93 14831
accuracy 0.89 18112
macro avg 0.80 0.89 0.83 18112
weighted avg 0.91 0.89 0.89 18112
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[163], line 3 1 print("RF MODEL") ----> 3 eval(rf, X_train_count, X_test_count) Cell In[130], line 11, in eval(model, X_train, X_test) 8 print(classification_report(y_train, y_pred_train)) 9 fig, ax = plt.subplots(figsize=(8, 8)) ---> 11 plot_confusion_matrix(model, X_test, y_test, ax=ax) NameError: name 'plot_confusion_matrix' is not defined
custom_scorer = {'accuracy': make_scorer(accuracy_score),
'precision-0': make_scorer(precision_score, pos_label=0),
'recall-0': make_scorer(recall_score, pos_label=0),
'f1-0': make_scorer(f1_score, pos_label=0),
'precision-1': make_scorer(precision_score, pos_label=1),
'recall-1': make_scorer(recall_score, pos_label=1),
'f1-1': make_scorer(f1_score, pos_label=1)
}
for i, j in custom_scorer.items():
model = RandomForestClassifier(200, max_depth = 12, random_state = 42, n_jobs = -1, class_weight="balanced")
scores = cross_val_score(model, X_train_count, y_train, cv = 10, scoring = j).mean()
if i == "recall-1":
rf_count_rec = scores
elif i == "f1-1":
rf_count_f1 = scores
print(f" {i:20} score for count : {scores}\n")
accuracy score for count : 0.8500434248393738 precision-0 score for count : 0.5623773261675364 recall-0 score for count : 0.7866502335236119 f1-0 score for count : 0.6554505796690441 precision-1 score for count : 0.9482590583977227 recall-1 score for count : 0.8640680633886655 f1-1 score for count : 0.9041336198854516
viz = PrecisionRecallCurve(
RandomForestClassifier(200, max_depth = 10, random_state = 42, n_jobs = -1, class_weight="balanced"),
classes=rf.classes_,
per_class=True,
cmap="Set1"
)
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_facecolor('yellow')
viz.fit(X_train_tf_idf,y_train)
viz.score(X_test_tf_idf, y_test)
viz.show();
rf_AP_tfidf = viz.score_
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier(n_estimators= 500, random_state = 42)
ada.fit(X_train_count, y_train)
AdaBoostClassifier(n_estimators=500, random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
AdaBoostClassifier(n_estimators=500, random_state=42)
print("Ada MODEL")
eval(ada, X_train_count, X_test_count)
Ada MODEL
Test_Set
precision recall f1-score support
0 0.73 0.59 0.65 820
1 0.91 0.95 0.93 3709
accuracy 0.89 4529
macro avg 0.82 0.77 0.79 4529
weighted avg 0.88 0.89 0.88 4529
Train_Set
precision recall f1-score support
0 0.80 0.66 0.72 3281
1 0.93 0.96 0.94 14831
accuracy 0.91 18112
macro avg 0.86 0.81 0.83 18112
weighted avg 0.90 0.91 0.90 18112
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[168], line 3 1 print("Ada MODEL") ----> 3 eval(ada, X_train_count, X_test_count) Cell In[130], line 11, in eval(model, X_train, X_test) 8 print(classification_report(y_train, y_pred_train)) 9 fig, ax = plt.subplots(figsize=(8, 8)) ---> 11 plot_confusion_matrix(model, X_test, y_test, ax=ax) NameError: name 'plot_confusion_matrix' is not defined
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
custom_scorer = {'accuracy': make_scorer(accuracy_score),
'precision-0': make_scorer(precision_score, pos_label=0),
'recall-0': make_scorer(recall_score, pos_label=0),
'f1-0': make_scorer(f1_score, pos_label=0),
'precision-1': make_scorer(precision_score, pos_label=1),
'recall-1': make_scorer(recall_score, pos_label=1),
'f1-1': make_scorer(f1_score, pos_label=1)
}
for i, j in custom_scorer.items():
model = AdaBoostClassifier(n_estimators= 500, random_state = 42)
scores = cross_val_score(model, X_train_count, y_train, cv = 10, scoring = j).mean()
if i == "recall-1":
ada_count_rec = scores
elif i == "f1-1":
ada_count_f1 = scores
print(f" {i:20} score for count : {scores}\n")
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[14], line 23 21 for i, j in custom_scorer.items(): 22 model = AdaBoostClassifier(n_estimators= 500, random_state = 42) ---> 23 scores = cross_val_score(model, X_train_count, y_train, cv = 10, scoring = j).mean() 24 if i == "recall-1": 25 ada_count_rec = scores NameError: name 'X_train_count' is not defined
from sklearn.metrics import PrecisionRecallCurve
viz = PrecisionRecallCurve(
AdaBoostClassifier(n_estimators= 500, random_state = 42),
classes=ada.classes_,
per_class=True,
cmap="Set1"
)
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_facecolor('yellow')
viz.fit(X_train_count,y_train)
viz.score(X_test_count, y_test)
viz.show();
--------------------------------------------------------------------------- ImportError Traceback (most recent call last) Cell In[15], line 1 ----> 1 from sklearn.metrics import PrecisionRecallCurve 2 viz = PrecisionRecallCurve( 3 AdaBoostClassifier(n_estimators= 500, random_state = 42), 4 classes=ada.classes_, 5 per_class=True, 6 cmap="Set1" 7 ) 9 fig, ax = plt.subplots(figsize=(10, 6)) ImportError: cannot import name 'PrecisionRecallCurve' from 'sklearn.metrics' (C:\ProgramData\anaconda3\Lib\site-packages\sklearn\metrics\__init__.py)
ada_AP_count = viz.score_
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[16], line 1 ----> 1 ada_AP_count = viz.score_ NameError: name 'viz' is not defined
ada = AdaBoostClassifier(n_estimators= 500, random_state = 42)
ada.fit(X_train_tf_idf, y_train)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[17], line 3 1 ada = AdaBoostClassifier(n_estimators= 500, random_state = 42) ----> 3 ada.fit(X_train_tf_idf, y_train) NameError: name 'X_train_tf_idf' is not defined
print("Ada MODEL")
eval(ada, X_train_tf_idf, X_test_tf_idf)
Ada MODEL
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[18], line 3 1 print("Ada MODEL") ----> 3 eval(ada, X_train_tf_idf, X_test_tf_idf) NameError: name 'X_train_tf_idf' is not defined
custom_scorer = {'accuracy': make_scorer(accuracy_score),
'precision-0': make_scorer(precision_score, pos_label=0),
'recall-0': make_scorer(recall_score, pos_label=0),
'f1-0': make_scorer(f1_score, pos_label=0),
'precision-1': make_scorer(precision_score, pos_label=1),
'recall-1': make_scorer(recall_score, pos_label=1),
'f1-1': make_scorer(f1_score, pos_label=1)
}
for i, j in custom_scorer.items():
model =AdaBoostClassifier(n_estimators= 500, random_state = 42)
scores = cross_val_score(model, X_train_tf_idf, y_train, cv = 10, scoring = j).mean()
if i == "recall-1":
ada_tfidf_rec = scores
elif i == "f1-1":
ada_tfidf_f1 = scores
print(f" {i:20} score for tfidf : {scores}\n")
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[19], line 12 10 for i, j in custom_scorer.items(): 11 model =AdaBoostClassifier(n_estimators= 500, random_state = 42) ---> 12 scores = cross_val_score(model, X_train_tf_idf, y_train, cv = 10, scoring = j).mean() 13 if i == "recall-1": 14 ada_tfidf_rec = scores NameError: name 'X_train_tf_idf' is not defined
viz = PrecisionRecallCurve(
AdaBoostClassifier(n_estimators= 500, random_state = 42),
classes=ada.classes_,
per_class=True,
cmap="Set1"
)
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_facecolor('yellow')
viz.fit(X_train_tf_idf,y_train)
viz.score(X_test_tf_idf, y_test)
viz.show();
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[20], line 1 ----> 1 viz = PrecisionRecallCurve( 2 AdaBoostClassifier(n_estimators= 500, random_state = 42), 3 classes=ada.classes_, 4 per_class=True, 5 cmap="Set1" 6 ) 8 fig, ax = plt.subplots(figsize=(10, 6)) 9 ax.set_facecolor('yellow') NameError: name 'PrecisionRecallCurve' is not defined
ada_AP_tfidf = viz.score_
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[21], line 1 ----> 1 ada_AP_tfidf = viz.score_ NameError: name 'viz' is not defined
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) Cell In[22], line 3 1 import numpy as np 2 import pandas as pd ----> 3 from tensorflow.keras.models import Sequential 4 from tensorflow.keras.layers import Dense, GRU, Embedding 5 from tensorflow.keras.optimizers import Adam ModuleNotFoundError: No module named 'tensorflow'
os.chdir('C:\\Users\\aksha\\OneDrive\\Desktop\\Placement\\Self Project\\NLP')
df0=pd.read_csv('Womens Clothing E-Commerce Reviews.csv')
df = df0.copy()
df.head()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[25], line 1 ----> 1 os.chdir('C:\\Users\\aksha\\OneDrive\\Desktop\\Placement\\Self Project\\NLP') 2 df0=pd.read_csv('Womens Clothing E-Commerce Reviews.csv') 3 df = df0.copy() NameError: name 'os' is not defined
df_dl = df_dl[["Review Text","Recommended IND"]]
df_dl.head()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[26], line 1 ----> 1 df_dl = df_dl[["Review Text","Recommended IND"]] 2 df_dl.head() NameError: name 'df_dl' is not defined
df_dl.shape
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[27], line 1 ----> 1 df_dl.shape NameError: name 'df_dl' is not defined
df_dl.dropna(inplace = True)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[28], line 1 ----> 1 df_dl.dropna(inplace = True) NameError: name 'df_dl' is not defined
df_dl.shape
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[29], line 1 ----> 1 df_dl.shape NameError: name 'df_dl' is not defined
X = df_dl['Review Text'].values
y = df_dl['Recommended IND'].values
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[30], line 1 ----> 1 X = df_dl['Review Text'].values 2 y = df_dl['Recommended IND'].values NameError: name 'df_dl' is not defined
tokenizer.fit_on_texts(X)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[31], line 1 ----> 1 tokenizer.fit_on_texts(X) NameError: name 'tokenizer' is not defined
tokenizer.word_index
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[32], line 1 ----> 1 tokenizer.word_index NameError: name 'tokenizer' is not defined
len(tokenizer.word_index)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[33], line 1 ----> 1 len(tokenizer.word_index) NameError: name 'tokenizer' is not defined
X_num_tokens = tokenizer.texts_to_sequences(X)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[34], line 1 ----> 1 X_num_tokens = tokenizer.texts_to_sequences(X) NameError: name 'tokenizer' is not defined
num_tokens = [len(tokens) for tokens in X_num_tokens]
num_tokens = np.array(num_tokens)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[35], line 1 ----> 1 num_tokens = [len(tokens) for tokens in X_num_tokens] 2 num_tokens = np.array(num_tokens) NameError: name 'X_num_tokens' is not defined
np.array(X_num_tokens)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[36], line 1 ----> 1 np.array(X_num_tokens) NameError: name 'X_num_tokens' is not defined
X[105]
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[37], line 1 ----> 1 X[105] NameError: name 'X' is not defined
print(X_num_tokens[105])
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[38], line 1 ----> 1 print(X_num_tokens[105]) NameError: name 'X_num_tokens' is not defined
tokenizer.word_index["shirt"]
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[39], line 1 ----> 1 tokenizer.word_index["shirt"] NameError: name 'tokenizer' is not defined
tokenizer.word_index["exactly"]
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[40], line 1 ----> 1 tokenizer.word_index["exactly"] NameError: name 'tokenizer' is not defined
num_tokens.mean()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[41], line 1 ----> 1 num_tokens.mean() NameError: name 'num_tokens' is not defined
num_tokens.max()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[42], line 1 ----> 1 num_tokens.max() NameError: name 'num_tokens' is not defined
num_tokens.argmax()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[43], line 1 ----> 1 num_tokens.argmax() NameError: name 'num_tokens' is not defined
X[16263]
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[44], line 1 ----> 1 X[16263] NameError: name 'X' is not defined
len(X[16263])
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[45], line 1 ----> 1 len(X[16263]) NameError: name 'X' is not defined
num_tokens.argmin()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[46], line 1 ----> 1 num_tokens.argmin() NameError: name 'num_tokens' is not defined
X[820]
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[47], line 1 ----> 1 X[820] NameError: name 'X' is not defined
len(X[820])
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[48], line 1 ----> 1 len(X[820]) NameError: name 'X' is not defined
len(X_num_tokens[105])
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[49], line 1 ----> 1 len(X_num_tokens[105]) NameError: name 'X_num_tokens' is not defined
np.array(X_num_tokens[105])
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[50], line 1 ----> 1 np.array(X_num_tokens[105]) NameError: name 'X_num_tokens' is not defined
len(X_num_tokens[106])
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[51], line 1 ----> 1 len(X_num_tokens[106]) NameError: name 'X_num_tokens' is not defined
np.array(X_num_tokens[106])
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[52], line 1 ----> 1 np.array(X_num_tokens[106]) NameError: name 'X_num_tokens' is not defined
num_tokens = [len(tokens) for tokens in X_num_tokens]
num_tokens = np.array(num_tokens)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[53], line 1 ----> 1 num_tokens = [len(tokens) for tokens in X_num_tokens] 3 num_tokens = np.array(num_tokens) NameError: name 'X_num_tokens' is not defined
num_tokens
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[54], line 1 ----> 1 num_tokens NameError: name 'num_tokens' is not defined
max_tokens = 103
sum(num_tokens < max_tokens) / len(num_tokens)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[56], line 1 ----> 1 sum(num_tokens < max_tokens) / len(num_tokens) NameError: name 'num_tokens' is not defined
sum(num_tokens < max_tokens) # the number of documents which have 103 or less tokens
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[57], line 1 ----> 1 sum(num_tokens < max_tokens) NameError: name 'num_tokens' is not defined
len(num_tokens) # total number of all documents in corpus which is constrained by num_words as 20000
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[58], line 1 ----> 1 len(num_tokens) NameError: name 'num_tokens' is not defined
X_pad = pad_sequences(X_num_tokens, maxlen=max_tokens)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[59], line 1 ----> 1 X_pad = pad_sequences(X_num_tokens, maxlen=max_tokens) NameError: name 'pad_sequences' is not defined
X_pad.shape
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[60], line 1 ----> 1 X_pad.shape NameError: name 'X_pad' is not defined
X_pad[105]
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[61], line 1 ----> 1 X_pad[105] NameError: name 'X_pad' is not defined
X_pad[106]
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[62], line 1 ----> 1 X_pad[106] NameError: name 'X_pad' is not defined
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, stratify=y, random_state=101)
# we have been using stratify to prevent imbalance.
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[63], line 2 1 from sklearn.model_selection import train_test_split ----> 2 X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, stratify=y, random_state=101) NameError: name 'X_pad' is not defined
model = Sequential()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[64], line 1 ----> 1 model = Sequential() NameError: name 'Sequential' is not defined
embedding_size = 100
model.add(Embedding(input_dim=num_words,
output_dim=embedding_size,
input_length=max_tokens,
name='embedding_layer'))
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Cell In[66], line 1 ----> 1 model.add(Embedding(input_dim=num_words, 2 output_dim=embedding_size, 3 input_length=max_tokens, 4 name='embedding_layer')) AttributeError: 'AdaBoostClassifier' object has no attribute 'add'
model.add(GRU(units=48, return_sequences=True))
model.add(GRU(units=24, return_sequences=True))
model.add(GRU(units=12))
model.add(Dense(1, activation='sigmoid'))
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Cell In[67], line 1 ----> 1 model.add(GRU(units=48, return_sequences=True)) 2 model.add(GRU(units=24, return_sequences=True)) 3 model.add(GRU(units=12)) AttributeError: 'AdaBoostClassifier' object has no attribute 'add'
optimizer = Adam(learning_rate=0.006)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[68], line 1 ----> 1 optimizer = Adam(learning_rate=0.006) NameError: name 'Adam' is not defined
model.compile(loss='binary_crossentropy',
optimizer=optimizer,
metrics=['Recall'])
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Cell In[69], line 1 ----> 1 model.compile(loss='binary_crossentropy', 2 optimizer=optimizer, 3 metrics=['Recall']) AttributeError: 'AdaBoostClassifier' object has no attribute 'compile'
model.summary()
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Cell In[70], line 1 ----> 1 model.summary() AttributeError: 'AdaBoostClassifier' object has no attribute 'summary'
from tensorflow.keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor="val_loss", mode="auto",
verbose=1, patience = 10, restore_best_weights=True)
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) Cell In[71], line 1 ----> 1 from tensorflow.keras.callbacks import EarlyStopping 3 early_stop = EarlyStopping(monitor="val_loss", mode="auto", 4 verbose=1, patience = 10, restore_best_weights=True) ModuleNotFoundError: No module named 'tensorflow'
pd.Series(y_train).value_counts(normalize=True)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[72], line 1 ----> 1 pd.Series(y_train).value_counts(normalize=True) NameError: name 'y_train' is not defined
weights = {0:82, 1:18}
model.fit(X_train, y_train, epochs=30, batch_size=256, class_weight=weights,
validation_data=(X_test, y_test), callbacks=[early_stop])
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[74], line 1 ----> 1 model.fit(X_train, y_train, epochs=30, batch_size=256, class_weight=weights, 2 validation_data=(X_test, y_test), callbacks=[early_stop]) NameError: name 'X_train' is not defined
model.save('NLP_Sentiment_Analysis_Project')
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Cell In[75], line 1 ----> 1 model.save('NLP_Sentiment_Analysis_Project') AttributeError: 'AdaBoostClassifier' object has no attribute 'save'
model_loss = pd.DataFrame(model.history.history)
model_loss.head()
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Cell In[76], line 1 ----> 1 model_loss = pd.DataFrame(model.history.history) 2 model_loss.head() AttributeError: 'AdaBoostClassifier' object has no attribute 'history'
model_loss.plot();
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[77], line 1 ----> 1 model_loss.plot() NameError: name 'model_loss' is not defined
model.evaluate(X_train, y_train)
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Cell In[78], line 1 ----> 1 model.evaluate(X_train, y_train) AttributeError: 'AdaBoostClassifier' object has no attribute 'evaluate'
model.evaluate(X_test, y_test)
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Cell In[79], line 1 ----> 1 model.evaluate(X_test, y_test) AttributeError: 'AdaBoostClassifier' object has no attribute 'evaluate'
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, roc_auc_score
y_train_pred = (model.predict(X_train) >= 0.5).astype("int32")
print(confusion_matrix(y_train, y_train_pred))
print("-------------------------------------------------------")
print(classification_report(y_train, y_train_pred))
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[80], line 3 1 from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, roc_auc_score ----> 3 y_train_pred = (model.predict(X_train) >= 0.5).astype("int32") 5 print(confusion_matrix(y_train, y_train_pred)) 6 print("-------------------------------------------------------") NameError: name 'X_train' is not defined
y_pred = (model.predict(X_test) >= 0.5).astype("int32")
print(confusion_matrix(y_test, y_pred))
print("-------------------------------------------------------")
print(classification_report(y_test, y_pred))
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[81], line 1 ----> 1 y_pred = (model.predict(X_test) >= 0.5).astype("int32") 3 print(confusion_matrix(y_test, y_pred)) 4 print("-------------------------------------------------------") NameError: name 'X_test' is not defined
from sklearn.metrics import precision_recall_curve, average_precision_score
y_pred_proba = model.predict(X_test)
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
# plt.plot([1, 0], [0, 1],'k--')
plt.plot(precision, recall)
plt.xlabel('precision')
plt.ylabel('recall')
plt.title('Precision Recall Curve')
plt.show()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[82], line 3 1 from sklearn.metrics import precision_recall_curve, average_precision_score ----> 3 y_pred_proba = model.predict(X_test) 4 precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba) 6 # plt.plot([1, 0], [0, 1],'k--') NameError: name 'X_test' is not defined
from sklearn.metrics import precision_recall_curve, average_precision_score, recall_score
DL_AP = average_precision_score(y_test, y_pred_proba)
DL_f1 = f1_score(y_test, y_pred)
DL_rec = recall_score(y_test, y_pred)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[83], line 3 1 from sklearn.metrics import precision_recall_curve, average_precision_score, recall_score ----> 3 DL_AP = average_precision_score(y_test, y_pred_proba) 4 DL_f1 = f1_score(y_test, y_pred) 5 DL_rec = recall_score(y_test, y_pred) NameError: name 'y_test' is not defined
review1 = "Love this dress"
review2 = "Absolutely wonderful. silky and sexy and comfortable"
review3 = "i initially ordered the petite small (my usual size) but i found this to be outrageously small. so small in fact that i could not zip it up!"
review4 = "I love, love, love this jumpsuit. it's fun, flirty, and fabulous! every time i wear it, i get nothing but great compliments!"
review5 = 'This shirt is very flattering to all due to the adjustable front tie. it is the perfect length to wear with leggings and it is sleeveless so it pairs well with any cardigan. love this shirt!!!'
review6 = 'I love tracy reese dresses, but this one is not for the very petite. i am just under 5 feet tall and usually wear a 0p in this brand. this dress was very pretty out of the package but its a lot of dress.'
review7 = 'I love this dress. i usually get an xs but it runs a little snug in bust so i ordered up a size. very flattering and feminine with the usual retailer flair for style.'
review8 = 'Dress runs small esp where the zipper area runs. i ordered the sp which typically fits me and it was very tight! the material on the top looks and feels very cheap that even just pulling on it will cause it to rip the fabric. pretty disappointed as it was going to be my christmas dress this year! needless to say it will be going back.'
review9 = "if you are at least average height or taller, this may look good on you."
review10 = "sadly will be returning, but i'm sure i will find something to exchange it for!"
review11 = "Cute little dress fits tts. it is a little high waisted. good length for my 5'9 height. i like the dress, i'm just not in love with it. i dont think it looks or feels cheap. it appears just as pictured."
review12 = 'Loved the material, but i didnt really look at how long the dress was before i purchased both a large and a medium. im 5\'5" and there was atleast 5" of material at my feet. the gaps in the front are much wider than they look. felt like the dress just fell flat. both were returned. im usually a large and the med fit better. 36d 30 in jeans'
review13 = "I have been waiting for this sweater coat to ship for weeks and i was so excited for it to arrive. this coat is not true to size and made me look short and squat."
review14 = 'Very comfortable, material is good, cut out on sleeves flattering'
reviews = [review1, review2, review3, review4, review5, review6, review7, review8, review9, review10, review11, review12, review13, review14]
tokens = tokenizer.texts_to_sequences(reviews)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[85], line 1 ----> 1 tokens = tokenizer.texts_to_sequences(reviews) NameError: name 'tokenizer' is not defined
tokens_pad = pad_sequences(tokens, maxlen=max_tokens)
tokens_pad.shape
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[86], line 1 ----> 1 tokens_pad = pad_sequences(tokens, maxlen=max_tokens) 2 tokens_pad.shape NameError: name 'pad_sequences' is not defined
mod_pred = model.predict(tokens_pad)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[87], line 1 ----> 1 mod_pred = model.predict(tokens_pad) NameError: name 'tokens_pad' is not defined
mod_pred
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[88], line 1 ----> 1 mod_pred NameError: name 'mod_pred' is not defined
df_pred = pd.DataFrame(mod_pred, index=reviews)
df_pred.rename(columns={0: 'Pred_Proba'}, inplace=True)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[89], line 1 ----> 1 df_pred = pd.DataFrame(mod_pred, index=reviews) 2 df_pred.rename(columns={0: 'Pred_Proba'}, inplace=True) NameError: name 'mod_pred' is not defined
df_pred["Predicted_Feedbaack"] = df_pred["Pred_Proba"].apply(lambda x: "Recommended" if x>=0.5 else "Not Recommended")
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[90], line 1 ----> 1 df_pred["Predicted_Feedbaack"] = df_pred["Pred_Proba"].apply(lambda x: "Recommended" if x>=0.5 else "Not Recommended") NameError: name 'df_pred' is not defined
df_pred
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[91], line 1 ----> 1 df_pred NameError: name 'df_pred' is not defined
compare = pd.DataFrame({"Model": ["NaiveBayes_count", "LogReg_count", "SVM_count", "Random Forest_count",
"AdaBoost_count", "NaiveBayes_tfidf", "LogReg_tfidf", "SVM_tfidf",
"Random Forest_tfidf", "AdaBoost_tfidf", "DL"],
"F1_Score": [nb_count_f1, log_count_f1, svc_count_f1,
rf_count_f1, ada_count_f1, nb_tfidf_f1, log_tfidf_f1,
svc_tfidf_f1, rf_tfidf_f1, ada_tfidf_f1, DL_f1],
"Recall_Score": [nb_count_rec, log_count_rec, svc_count_rec,
rf_count_rec, ada_count_rec,
nb_tfidf_rec, log_tfidf_rec, svc_tfidf_rec,
rf_tfidf_rec, ada_tfidf_rec, DL_rec],
"Average_Precision_Score": [nb_AP_count, log_AP_count, svc_AP_count, rf_AP_count,
ada_AP_count, nb_AP_tfidf, log_AP_tfidf, svc_AP_tfidf,
rf_AP_tfidf, ada_AP_tfidf, DL_AP]})
def labels(ax):
for p in ax.patches:
width = p.get_width() # get bar length
ax.text(width, # set the text at 1 unit right of the bar
p.get_y() + p.get_height() / 2, # get Y coordinate + X coordinate / 2
'{:1.3f}'.format(width), # set variable to display, 2 decimals
ha = 'left', # horizontal alignment
va = 'center') # vertical alignment
plt.figure(figsize=(15,30))
plt.subplot(311)
compare = compare.sort_values(by="Recall_Score", ascending=False)
ax=sns.barplot(x="Recall_Score", y="Model", data=compare, palette="Blues_d")
labels(ax)
plt.subplot(312)
compare = compare.sort_values(by="F1_Score", ascending=False)
ax=sns.barplot(x="F1_Score", y="Model", data=compare, palette="Blues_d")
labels(ax)
plt.subplot(313)
compare = compare.sort_values(by="Average_Precision_Score", ascending=False)
ax=sns.barplot(x="Average_Precision_Score", y="Model", data=compare, palette="Blues_d")
labels(ax)
plt.show();
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[92], line 5 1 compare = pd.DataFrame({"Model": ["NaiveBayes_count", "LogReg_count", "SVM_count", "Random Forest_count", 2 "AdaBoost_count", "NaiveBayes_tfidf", "LogReg_tfidf", "SVM_tfidf", 3 "Random Forest_tfidf", "AdaBoost_tfidf", "DL"], 4 ----> 5 "F1_Score": [nb_count_f1, log_count_f1, svc_count_f1, 6 rf_count_f1, ada_count_f1, nb_tfidf_f1, log_tfidf_f1, 7 svc_tfidf_f1, rf_tfidf_f1, ada_tfidf_f1, DL_f1], 8 9 "Recall_Score": [nb_count_rec, log_count_rec, svc_count_rec, 10 rf_count_rec, ada_count_rec, 11 nb_tfidf_rec, log_tfidf_rec, svc_tfidf_rec, 12 rf_tfidf_rec, ada_tfidf_rec, DL_rec], 13 14 "Average_Precision_Score": [nb_AP_count, log_AP_count, svc_AP_count, rf_AP_count, 15 ada_AP_count, nb_AP_tfidf, log_AP_tfidf, svc_AP_tfidf, 16 rf_AP_tfidf, ada_AP_tfidf, DL_AP]}) 18 def labels(ax): 20 for p in ax.patches: NameError: name 'nb_count_f1' is not defined